Download and Preprocess Wyscout Data

This script downloads, unzips, and preprocesses Wyscout event and match data. The processed data is then split into training, validation, and test sets.
Dependencies

subprocess
numpy
pandas
openstarlab_preprocessing
Usage

import subprocess
from preprocessing import Event_data
import numpy as np
import pandas as pd

def download_with_wget(url, output_directory='.'):
    # Construct the wget command
    command = ['wget', url, '-P', output_directory]

    # Execute the command
    try:
        subprocess.run(command, check=True)
        print(f"Downloaded {url} successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Failed to download {url}. Error: {e}")

if __name__ == "__main__":
    #path for the wyscout data
    event_path='./event'
    matches_path='./matches'

    #download the wyscout data
    event_url = "https://figshare.com/ndownloader/files/14464685/events.zip"
    matches_url = "https://figshare.com/ndownloader/files/14464622/matches.zip"
    download_with_wget(event_url, event_path)
    download_with_wget(matches_url, matches_path)

    #unzip the downloaded files
    subprocess.run(['unzip', 'event/events.zip', '-d', 'event'])
    subprocess.run(['unzip', 'matches/matches.zip', '-d', 'matches'])

    #remove the unnecessary files (expect England/France/Italy/Spain/Germany files)
    subprocess.run(['rm', '-rf', 'event/events.zip'])
    subprocess.run(['rm', '-rf', 'matches/matches.zip'])
    subprocess.run(['rm', '-rf', 'event/events_European_Championship.json'])
    subprocess.run(['rm', '-rf', 'event/events_World_Cup.json'])
    subprocess.run(['rm', '-rf', 'matches/matches_European_Championship.json'])
    subprocess.run(['rm', '-rf', 'matches/matches_World_Cup.json'])

    #load and preprocess the data (increase max_workers for faster processing)
    wyscout_df=Event_data(data_provider='wyscout',event_path=event_path,wyscout_matches_path=matches_path,
                        preprocess_method="NMSTPP",max_workers=1).preprocessing()
    wyscout_df.to_csv('data.csv',index=False)

    #split the data into train valid and test
    Train_ratio=0.8
    Valid_ratio=0.1
    Test_ratio=0.1

    Train_id=[]
    Valid_id=[]
    Test_id=[]
    for i in np.unique(wyscout_df[['comp']]):
        temp=wyscout_df[wyscout_df['comp']==i]
        id_list=temp.match_id.unique()
        Train_id+=id_list[0:round(temp.match_id.nunique()*Train_ratio)].tolist()
        Valid_id+=id_list[round(temp.match_id.nunique()*Train_ratio):round(temp.match_id.nunique()*(Train_ratio+Valid_ratio))].tolist()
        Test_id+=id_list[round(temp.match_id.nunique()*(Train_ratio+Valid_ratio)):].tolist()

    train=wyscout_df[wyscout_df["match_id"].isin(Train_id)]
    valid=wyscout_df[wyscout_df["match_id"].isin(Valid_id)]
    test=wyscout_df[wyscout_df["match_id"].isin(Test_id)]

    train.to_csv("train.csv",index=False)
    valid.to_csv("valid.csv",index=False)
    test.to_csv("test.csv",index=False)
    print("---------------done-----------------")