Download and Preprocess StatsBomb and SkillCorner Data

This script downloads StatsBomb event and match data, matches it with SkillCorner tracking data, and processes the combined data.

Dependencies

  • os

  • json

  • tqdm

  • pandas

  • statsbombpy

  • openstarlab_preprocessing

Usage

import os
import json
from tqdm import tqdm
import pandas as pd
from statsbombpy import sb

from preprocessing import Event_data

# Download StatsBomb match, lineup and event data and save it to disk
def download_statsbomb_data(creds, save_dir, competition_id=11, season_id=281):
    """Download StatsBomb matches, lineups and events for one competition season.

    The following files are written under ``save_dir`` (directories are
    created when missing):

    * ``matches.csv`` - the matches table, with ``competition_id`` and
      ``season_id`` as the first two columns.
    * ``events/<match_id>.csv`` - one events table per match.
    * ``lineups/<match_id>.json`` - one lineups file per match.

    Args:
        creds: Credentials forwarded unchanged to every statsbombpy call.
        save_dir: Root output directory.
        competition_id: Competition id passed to ``sb.matches``.
        season_id: Season id passed to ``sb.matches``.
    """
    lineups_dir = os.path.join(save_dir, "lineups")
    events_dir = os.path.join(save_dir, "events")
    os.makedirs(save_dir, exist_ok=True)

    def convert_df_in_dict(d):
        # Recursively replace DataFrame values (in place) with lists of row
        # dicts so that the structure can be passed to json.dump.
        for key, value in d.items():
            if isinstance(value, pd.DataFrame):
                d[key] = value.to_dict(orient='records')
            elif isinstance(value, dict):
                convert_df_in_dict(value)
        return d

    # Get Statsbomb matches data
    matches = sb.matches(competition_id=competition_id, season_id=season_id, creds=creds)
    matches["competition_id"] = competition_id
    matches["season_id"] = season_id
    # Move competition_id and season_id to the front. The columns are selected
    # by name rather than by position, so the ordering stays correct even if
    # the downloaded table already contained one of them (in which case the
    # assignments above overwrite in place instead of appending at the end).
    id_cols = ["competition_id", "season_id"]
    other_cols = [col for col in matches.columns if col not in id_cols]
    matches = matches[id_cols + other_cols]
    # Save the matches to csv
    matches.to_csv(os.path.join(save_dir, "matches.csv"), index=False)

    # Get Statsbomb lineups and events
    os.makedirs(lineups_dir, exist_ok=True)
    os.makedirs(events_dir, exist_ok=True)
    for match_id in tqdm(matches["match_id"].unique()):
        lineups = sb.lineups(match_id=match_id, creds=creds)
        events = sb.events(match_id=match_id, include_360_metrics=True, creds=creds)
        events.to_csv(os.path.join(events_dir, f"{match_id}.csv"), index=False)
        # Save the lineups as json, with DataFrames converted to row records
        lineups = convert_df_in_dict(lineups)
        # Explicit encoding keeps the output independent of the platform locale.
        with open(os.path.join(lineups_dir, f"{match_id}.json"), "w", encoding="utf-8") as f:
            json.dump(lineups, f, indent=4)

if __name__ == "__main__":
    # StatsBomb API credentials (replace the placeholder strings)
    creds = {
        "user": "input your Statsbomb api user name here",
        "passwd": "input your Statsbomb api password here",
    }
    # Directory where the StatsBomb download is saved
    save_dir = "/statsbomb"
    # Location of the SkillCorner tracking data
    tracking_path = "/skillcorner/tracking"
    # Location of the SkillCorner match data
    match_path = "/skillcorner/match"

    download_statsbomb_data(creds, save_dir)

    # Output file (a CSV file path, despite the variable name)
    output_dir = "data.csv"

    # Keyword arguments common to both loading modes below
    shared_kwargs = dict(
        data_provider='statsbomb_skillcorner',
        statsbomb_event_dir=save_dir + '/events',
        skillcorner_tracking_dir=tracking_path,
        skillcorner_match_dir=match_path,
    )

    # Mode 1: match StatsBomb and SkillCorner data for a single match.
    # For the pairing of match ids refer to id_matching.csv.
    statsbomb_match_id = 3894907
    skillcorner_match_id = 1553748
    single_match_loader = Event_data(
        **shared_kwargs,
        statsbomb_match_id=statsbomb_match_id,
        skillcorner_match_id=skillcorner_match_id,
    )
    statsbomb_skillcorner_df = single_match_loader.load_data()
    statsbomb_skillcorner_df.to_csv(output_dir, index=False)

    # Mode 2: match StatsBomb and SkillCorner data for multiple matches.
    # For La Liga 23-24 data the id table is located in
    # "openstarlab_preprocessing/open/example/id_matching.csv".
    id_matching_path = os.getcwd() + '/id_matching.csv'
    multi_match_loader = Event_data(**shared_kwargs, match_id_df=id_matching_path)
    statsbomb_skillcorner_df = multi_match_loader.load_data()
    # NOTE: this writes to the same file as mode 1 and therefore replaces it.
    statsbomb_skillcorner_df.to_csv(output_dir, index=False)
    print("---------------done-----------------")