diff --git a/preprocessing/sports/event_data/soccer/soccer_event_class.py b/preprocessing/sports/event_data/soccer/soccer_event_class.py index d0ea9c2..4802274 100644 --- a/preprocessing/sports/event_data/soccer/soccer_event_class.py +++ b/preprocessing/sports/event_data/soccer/soccer_event_class.py @@ -36,7 +36,7 @@ def __init__(self,data_provider,event_path=None,match_id=None,tracking_home_path statsbomb_event_dir=None, skillcorner_tracking_dir=None, skillcorner_match_dir=None, preprocess_method=None,sb360_path=None,wyscout_matches_path=None, st_track_path=None, st_meta_path=None,verbose=False, - preprocess_tracking=False): + preprocess_tracking=False, soccertrackv2=False): self.data_provider = data_provider self.event_path = event_path self.match_id = match_id @@ -60,6 +60,7 @@ def __init__(self,data_provider,event_path=None,match_id=None,tracking_home_path self.preprocess_tracking = preprocess_tracking self.verbose = verbose self.call_preprocess = False + self.soccertrackv2 = soccertrackv2 def load_data_single_file(self): #based on the data provider, load the dataloading function from load_data.py (single file) @@ -88,7 +89,10 @@ def load_data_single_file(self): elif self.data_provider == 'datastadium': df=soccer_load_data.load_datastadium(self.event_path,self.tracking_home_path,self.tracking_away_path) elif self.data_provider == 'bepro': - df=soccer_load_data.load_soccertrack(self.event_path, self.st_track_path, self.st_meta_path, self.verbose) + if self.soccertrackv2 == True: + df=soccer_load_data.load_soccertrack(self.event_path, self.tracking_path, self.meta_data, self.verbose) + elif self.soccertrackv2 == False: + df=soccer_load_data.load_bepro(self.event_path, self.tracking_path, self.meta_data, self.verbose) else: raise ValueError('Data provider not supported or not found') return df @@ -313,7 +317,10 @@ def preprocessing_single_df(self,df): elif self.data_provider == "datastadium": df_out=soccer_processing.UIED_datastadium(df) elif self.data_provider == "bepro": - df_out=soccer_processing.UIED_bepro(df) + if self.soccertrackv2 == True: + df_out=soccer_processing.UIED_soccertrackv2(df) + elif self.soccertrackv2 == False: + df_out=soccer_processing.UIED_bepro(df) elif self.data_provider == "wyscout": if self.preprocess_method == "UIED": df_out=soccer_processing.UIED_wyscout(df) diff --git a/preprocessing/sports/event_data/soccer/soccer_load_data.py b/preprocessing/sports/event_data/soccer/soccer_load_data.py index 591e2c4..da4c4d5 100644 --- a/preprocessing/sports/event_data/soccer/soccer_load_data.py +++ b/preprocessing/sports/event_data/soccer/soccer_load_data.py @@ -1396,7 +1396,7 @@ def calculate_sync_bias(event_df, tracking_data, period=1, verbose=False): ball_coordinates.append(ball_data_i) #find the time with the highest acceleration ball_coordinates = np.array(ball_coordinates) - ball_speed = np.linalg.norm(np.diff(ball_coordinates,axis=0),axis=1) + ball_speed = np.linalg.norm(np.diff(ball_coordinates, n=2, axis=0), axis=1) max_speed_index = np.argmax(ball_speed) max_speed_time = time_list[max_speed_index] bias = max_speed_time - first_event_time @@ -1537,6 +1537,377 @@ def get_tracking_features(event_df, tracking_data, meta_data, verbose=True): return event_df +def load_bepro(event_path: str, tracking_path: str, meta_path: str, verbose: bool = False) -> pd.DataFrame: + """ + Loads and processes event and tracking data from soccer match recordings. + + This function combines event data with tracking data by merging based on event time. It also adds + additional features extracted from metadata, such as player information, and converts position + coordinates to the correct scale for analysis. + + Args: + event_path (str): Path to the CSV file containing event data. + tracking_path (str): Path to the XML file containing tracking data. + meta_path (str): Path to the XML file containing match metadata (pitch, teams, players, etc.). + verbose (bool, optional): If True, prints additional information about the merging process and + feature extraction. Default is False. + + Returns: + pd.DataFrame: A DataFrame containing the merged and processed event and tracking data, + with additional features including player positions, speeds, ball position, + and metadata (e.g., player names, shirt numbers, positions). + """ + + def read_event_data(file_path): + #check if file_path is str or list + if isinstance(file_path, str): + file_path = [file_path] + file_list = [] + for file_path_i in file_path: + with open(file_path_i, 'r', encoding='utf-8') as f: + data = json.load(f) + event_df_i = pd.json_normalize( + data['data'], + record_path="events", # expand events list + meta=[ + "period_type", + "period_name", + "period_order", + "period_duration", + "period_start_time", + "event_time", + "team_name", + "player_shirt_number", + "player_name", + "x", "y", "to_x", "to_y", + "attack_direction" + ] + ) + event_df_i.columns = event_df_i.columns.str.replace("property.", "", regex=False) + file_list.append(event_df_i) + #extract the period_order from each file and concatenate the dataframes in the order of period_order + file_list = sorted(file_list, key=lambda x: x.iloc[0].period_order) + event_df = pd.concat(file_list, ignore_index=True) + #reorder the columns + event_df = event_df[[ + "period_type", + "period_name", + "period_order", + "period_duration", + "period_start_time", + "event_time", + "team_name", + "player_shirt_number", + "player_name", + "event_name", + "Outcome", + "Area", + "Direction", + "Distance", + "x", "y", "to_x", "to_y", + "attack_direction" + ]] + return event_df + + def read_meta_data(file_path): + # Parse the XML file + tree = ET.parse(file_path) + root = tree.getroot() + + # Extract match information + # match = root.find('.//match') + # match_info = {attr: match.get(attr) for attr in match.attrib} + + # Extract period information + # periods = root.findall('.//period') + # period_info = [{attr: period.get(attr) for attr in period.attrib} for period in periods] + + # Extract pitch information + pitch = root.find('.//pitch') + pitch_info = {attr: pitch.get(attr) for attr in pitch.attrib} + + # Extract team information + teams = root.findall('.//team') + team_info = [{attr: team.get(attr) for attr in team.attrib} for team in teams] + + # Extract player information + players = root.findall('.//players//player') + player_info = [{attr: player.get(attr) for attr in player.attrib} for player in players] + + # Extract active players information + # chunks = root.findall('.//chunk') + # active_players_info = [] + # for chunk in chunks: + # chunk_info = {attr: chunk.get(attr) for attr in chunk.attrib} + # chunk_info['players'] = [ + # {attr: player.get(attr) for attr in player.attrib} + # for player in chunk.findall('player') + # ] + # active_players_info.append(chunk_info) + + return { + # 'match_info': match_info, + # 'period_info': period_info, + 'pitch_info': pitch_info, + 'team_info': team_info, + 'player_info': player_info, + # 'active_players_info': active_players_info + } + + def read_tracking_data(file_path): + # Parse the XML file + tree = ET.parse(file_path) + root = tree.getroot() + + # List to store all frame data + frames = {} + + # Iterate through each frame + for frame in root.findall('frame'): + frame_data = { + 'matchTime': int(frame.get('matchTime')), + 'frameNumber': int(frame.get('frameNumber')), + 'eventPeriod': frame.get('eventPeriod'), + 'ballStatus': frame.get('ballStatus'), + 'players': [], + 'ball': None + } + + # Process player data + for player in frame.findall('player'): + player_data = { + 'playerId': player.get('playerId'), + 'loc': eval(player.get('loc')), # Convert string to list + 'speed': float(player.get('speed')) + } + frame_data['players'].append(player_data) + + # Process ball data + ball = frame.find('ball') + if ball is not None: + frame_data['ball'] = { + 'playerId': ball.get('playerId'), + 'loc': eval(ball.get('loc')), # Convert string to list + 'speed': ball.get('speed') + } + + frames[frame_data['matchTime']] = frame_data + + return frames + + #load the event data + + def get_additional_features(event_df, meta_data): + #player info: id name nameEN shirtNumber position + # create features period, seconds, event_type, event_type_2, outcome, home_team, x_unscaled, y_unscaled, + period_dict = {1:"FIRST_HALF",2:"SECOND_HALF",3:"EXTRA_FIRST_HALF",4:"EXTRA_SECOND_HALF"} + event_df["period"] = event_df["period_order"]+1 + event_df["event_period"] = event_df["period"].map(period_dict) + event_df["seconds"] = event_df["event_time"]/1000 + + event_df["event_type"] = event_df["event_name"] + + def map_team(name, team_dict): + # sort keys by length descending (longest first) + for key in sorted(team_dict.keys(), key=len, reverse=True): + if name.startswith(key): + return team_dict[key] + return None + + team_dict = {team_info["nameEn"]:str(team_info["id"]) for team_info in meta_data["team_info"]} + event_df["team_id"] = event_df["team_name"].apply(lambda x: map_team(x, team_dict)) + player_dict = {player_info["nameEn"]:player_info["id"] for player_info in meta_data["player_info"]} + event_df["player_id"] = event_df["player_name"].map(player_dict) + + home_team_dict = {str(team_info["id"]):team_info["side"] for team_info in meta_data["team_info"]} + event_df["home_team"] = event_df["team_id"].map(home_team_dict) + #convert "home" to 1 and "away" to 0 for home_team + event_df["home_team"] = event_df["home_team"].map({"home":1,"away":0}) + + #x and y coordinates of the field (height,width) for the event data (inverse of the tracking data) + event_df["x_unscaled"] = event_df["y"]*int(meta_data["pitch_info"]["width"]) + event_df["y_unscaled"] = event_df["x"]*int(meta_data["pitch_info"]["height"]) + event_df["to_x_unscaled"] = event_df["to_y"]*int(meta_data["pitch_info"]["width"]) + event_df["to_y_unscaled"] = event_df["to_x"]*int(meta_data["pitch_info"]["height"]) + return event_df + + def calculate_sync_bias(event_df, tracking_data, period=1, verbose=False): + # 'FIRST_HALF' "SECOND_HALF" + # Calculate the bias between event time and tracking time + limit = 5.0 #seconds + time_list = [key for key in tracking_data.keys()] + #split the time_list into two halves + if period == 1: + time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'FIRST_HALF'] + first_event_time = event_df[event_df["event_period"]=="FIRST_HALF"].iloc[0].event_time if "FIRST_HALF" in event_df["event_period"].values else 0 + elif period == 2: + time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'SECOND_HALF'] + first_event_time = event_df[event_df["event_period"]=="SECOND_HALF"].iloc[0].event_time if "SECOND_HALF" in event_df["event_period"].values else 0 + elif period == 3: + time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'EXTRA_FIRST_HALF'] + first_event_time = event_df[event_df["event_period"]=="EXTRA_FIRST_HALF"].iloc[0].event_time if "EXTRA_FIRST_HALF" in event_df["event_period"].values else 0 + + if time_list == []: + return 0 + + time_list.sort() + start_time = max(time_list[0],0) + #round to the nearest 1000 + start_time = round(start_time/1000)*1000 + print("start_time:",start_time) if verbose else None + #drop the time that exceeds the limit of the event time + time_list = [time for time in time_list if time <= start_time+limit*1000] + #order the time_list in ascending order + time_list.sort() + + ball_coordinates = [] + for time_i in time_list: + tracking_data_i = tracking_data[time_i] + ball_data_i = tracking_data_i['ball']['loc'] + ball_coordinates.append(ball_data_i) + #find the time with the highest acceleration + ball_coordinates = np.array(ball_coordinates) + ball_speed = np.linalg.norm(np.diff(ball_coordinates, n=2, axis=0), axis=1) + max_speed_index = np.argmax(ball_speed) + max_speed_time = time_list[max_speed_index] + bias = max_speed_time - first_event_time + + return bias + + def get_tracking_features(event_df, tracking_data, meta_data, verbose=True): + # combine the event data with the tracking data via event_time and matchTime + #get the player info + time_list = [key for key in tracking_data.keys()] + time_diff_list = [] + player_dict = {} + home_team_player_count = 0 + away_team_player_count = 0 + home_team_dict = {int(team_info["id"]):team_info["side"] for team_info in meta_data["team_info"]} + for player_i in meta_data["player_info"]: + player_dict[player_i["id"]] = player_i + team_id = int(player_dict[player_i["id"]]['teamId']) + if home_team_dict[team_id] == 'home': + player_dict[player_i["id"]]["player_num"] = home_team_player_count+1 + home_team_player_count += 1 + elif home_team_dict[team_id] == 'away': + player_dict[player_i["id"]]["player_num"] = away_team_player_count+1 + away_team_player_count += 1 + else: + print("team_id not found") + pdb.set_trace() + + #create the additional features + tracking_features=["player_id","x","y","speed"] + meta_features=["name","nameEn","shirtNumber","position"] + ball_features = ["ball_x","ball_y","ball_speed"] + additional_features = tracking_features+meta_features + additional_featurs_dict = {} + for i in range(home_team_player_count): + for j in range(len(additional_features)): + additional_featurs_dict[f"home_{additional_features[j]}_{i+1}"] = [] + for i in range(away_team_player_count): + for j in range(len(additional_features)): + additional_featurs_dict[f"away_{additional_features[j]}_{i+1}"] = [] + for j in range(len(ball_features)): + additional_featurs_dict[ball_features[j]] = [] + + additional_featurs_dict["tracking_time"] = [] + additaional_features_dict_key_list = [key for key in additional_featurs_dict.keys()] + + #get the sync bias for the event and tracking data + bias_1 = calculate_sync_bias(event_df, tracking_data, period=1, verbose=verbose) #FIRST_HALF + bias_2 = calculate_sync_bias(event_df, tracking_data, period=2, verbose=verbose) #SECOND_HALF + bias_3 = calculate_sync_bias(event_df, tracking_data, period=3, verbose=verbose) #EXTRA_FIRST_HALF + + print("bias_1:",bias_1,"bias_2:",bias_2,"bias_3:",bias_3) if verbose else None + + if verbose: + iterable = tqdm(range(len(event_df))) + else: + iterable = range(len(event_df)) + for i in iterable: + updated_features = [] + event_time = event_df.iloc[i].event_time + period = event_df.iloc[i].event_period + if period == 'FIRST_HALF': + event_time += bias_1 + elif period == 'SECOND_HALF': + event_time += bias_2 + elif period == 'EXTRA_FIRST_HALF': + event_time += bias_3 + else: + print("period not included") + #find the nearest time in the tracking data + nearest_time = min(time_list, key=lambda x:abs(x-event_time)) + try: + additional_featurs_dict["tracking_time"].append(nearest_time) + updated_features+=["tracking_time"] + except: + pass + time_diff_list.append(nearest_time-event_time) + #get the tracking data + tracking_data_i = tracking_data[nearest_time] + for player_track_j in tracking_data_i['players']: + player_j_id = player_track_j['playerId'] + player_j_num = player_dict[player_j_id]["player_num"] + player_j_team = player_dict[player_j_id]["teamId"] + player_j_home = home_team_dict[int(player_j_team)] + # append the tracking data and meta data to the additional features + additional_featurs_dict[f"{player_j_home}_player_id_{player_j_num}"].append(player_track_j['playerId']) + additional_featurs_dict[f"{player_j_home}_x_{player_j_num}"].append(round(player_track_j['loc'][0]*int(meta_data["pitch_info"]["width"]),2)) + additional_featurs_dict[f"{player_j_home}_y_{player_j_num}"].append(round(player_track_j['loc'][1]*int(meta_data["pitch_info"]["height"]),2)) + additional_featurs_dict[f"{player_j_home}_speed_{player_j_num}"].append(player_track_j['speed']) + additional_featurs_dict[f"{player_j_home}_name_{player_j_num}"].append(player_dict[player_j_id]["name"]) + additional_featurs_dict[f"{player_j_home}_nameEn_{player_j_num}"].append(player_dict[player_j_id]["nameEn"]) + additional_featurs_dict[f"{player_j_home}_shirtNumber_{player_j_num}"].append(player_dict[player_j_id]["shirtNumber"]) + additional_featurs_dict[f"{player_j_home}_position_{player_j_num}"].append(player_dict[player_j_id]["position"]) + updated_features+=[f"{player_j_home}_player_id_{player_j_num}",f"{player_j_home}_x_{player_j_num}",f"{player_j_home}_y_{player_j_num}",f"{player_j_home}_speed_{player_j_num}",f"{player_j_home}_name_{player_j_num}",f"{player_j_home}_nameEn_{player_j_num}",f"{player_j_home}_shirtNumber_{player_j_num}",f"{player_j_home}_position_{player_j_num}"] + ball_track = tracking_data_i['ball'] + additional_featurs_dict[f"ball_x"].append(round(ball_track['loc'][0]*int(meta_data["pitch_info"]["width"]),2)) + additional_featurs_dict[f"ball_y"].append(round(ball_track['loc'][1]*int(meta_data["pitch_info"]["height"]),2)) + if ball_track['speed'] == 'NA': + additional_featurs_dict[f"ball_speed"].append(None) + else: + additional_featurs_dict[f"ball_speed"].append(ball_track['speed']) + updated_features+=["ball_x","ball_y","ball_speed"] + # for features in additaional_features_dict_key_list but not in updated_features, append None + for key in additaional_features_dict_key_list: + if key not in updated_features: + additional_featurs_dict[key].append(None) + + #add the additional features to the event_df + out_event_df = event_df.copy() + if verbose: + for key in additional_featurs_dict.keys(): + print(key,len(additional_featurs_dict[key])) + + # Create a DataFrame from the additional features dictionary + additional_features_df = pd.DataFrame(additional_featurs_dict) + + # Concatenate the original event_df with the additional features DataFrame + out_event_df = pd.concat([event_df, additional_features_df], axis=1) + + #print the mean and std of the time_diff_list + if verbose: + print("mean time difference:",round(np.mean(time_diff_list),4)) + print("std time difference:",round(np.std(time_diff_list),4)) + print("max time difference:",round(np.max(time_diff_list),4)) + print("min time difference:",round(np.min(time_diff_list),4)) + return out_event_df + + # Load the event data + event_df = read_event_data(event_path) + # Load the tracking data + tracking_data = read_tracking_data(tracking_path) + # Load the meta data + meta_data = read_meta_data(meta_path) + # Get additional features + event_df = get_additional_features(event_df, meta_data) + # Get tracking features + event_df = get_tracking_features(event_df, tracking_data, meta_data, verbose=verbose) + + return event_df + def load_pff2metrica(event_path:str, match_id:str = None) -> pd.DataFrame: """ Convert PFF-style event data to Metrica format. @@ -1825,12 +2196,22 @@ def apply_subtype(success_col, present_series): # event=load_datastadium(datastadium_event_path,datastadium_home_tracking_path,datastadium_away_tracking_path) # event.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/load.csv",index=False) - #test load_soccertrack - soccer_track_event_path="/data_pool_1/soccertrackv2/2023-11-18/Event/event.csv" - soccer_track_tracking_path="/data_pool_1/soccertrackv2/2023-11-18/Tracking/tracking.xml" - soccer_track_meta_path="/data_pool_1/soccertrackv2/2023-11-18/Tracking/meta.xml" - df_soccertrack=load_soccertrack(soccer_track_event_path,soccer_track_tracking_path,soccer_track_meta_path,True) - df_soccertrack.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/soccertrack/load.csv",index=False) + # #test load_soccertrack + # soccer_track_event_path="/data_pool_1/soccertrackv2/2023-11-18/Event/event.csv" + # soccer_track_tracking_path="/data_pool_1/soccertrackv2/2023-11-18/Tracking/tracking.xml" + # soccer_track_meta_path="/data_pool_1/soccertrackv2/2023-11-18/Tracking/meta.xml" + # df_soccertrack=load_soccertrack(soccer_track_event_path,soccer_track_tracking_path,soccer_track_meta_path,True) + # df_soccertrack.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/soccertrack/load.csv",index=False) + + #test load_bepro + soccer_track_event_path=["/data_pool_1/soccertrackv2/117093/2023-11-25_筑波大学 vs 筑波大学 - B1_1st Half.json", + "/data_pool_1/soccertrackv2/117093/2023-11-25_筑波大学 vs 筑波大学 - B1_2nd Half.json"] + soccer_track_tracking_path="/data_pool_1/soccertrackv2/117093/tracker_box_data_125091.xml" + soccer_track_meta_path="/data_pool_1/soccertrackv2/117093/tracker_box_metadata_125091.xml" + df_soccertrack=load_bepro(soccer_track_event_path,soccer_track_tracking_path,soccer_track_meta_path,True) + os.makedirs(os.getcwd()+"/test/sports/event_data/data/bepro", exist_ok=True) + df_soccertrack.to_csv(os.getcwd()+"/test/sports/event_data/data/bepro/load.csv",index=False) + print("----------------done-----------------") # pdb.set_trace() diff --git a/preprocessing/sports/event_data/soccer/soccer_processing.py b/preprocessing/sports/event_data/soccer/soccer_processing.py index bd9e1bb..1e97266 100644 --- a/preprocessing/sports/event_data/soccer/soccer_processing.py +++ b/preprocessing/sports/event_data/soccer/soccer_processing.py @@ -1510,7 +1510,7 @@ def UIED_datastadium(data): return df -def UIED_bepro(data): +def UIED_soccertrackv2(data): """ Processes football event data from a DataFrame or CSV file, creating various features for analysis. @@ -1851,6 +1851,327 @@ def UIED_bepro(data): return df +def UIED_bepro(data): + """ + Processes football event data from a DataFrame or CSV file, creating various features for analysis. + + Parameters: + - data (pd.DataFrame or str): If a string, it should be a path to a CSV file. If a DataFrame, it should contain the event data. + + Returns: + - pd.DataFrame: Processed DataFrame with additional features and cleaned data. + """ + # Load data from DataFrame or file path + if isinstance(data, pd.DataFrame): + df = data + elif isinstance(data, str): + if os.path.exists(data): + df = pd.read_csv(data) + else: + raise FileNotFoundError("The file path does not exist") + else: + raise ValueError("The data must be a pandas DataFrame or a file path") + + df = df.copy() + + # Create 'action' column by concatenating 'event_type' and 'event_type_2' + df["success"]=df["Outcome"].map({"Succeeded": 1, "Failed": 0, 'Goals': 1}).fillna(0) + df["action"] = df["event_type"] + + # Define possession team actions + all_cation=['Passes', 'Passes Received', 'Set Pieces', 'Turnovers', + 'Interceptions', 'Key Passes', 'Shots & Goals', 'Crosses', + 'Aerial Control', 'Clearances', 'Recoveries', 'Step-in', 'Blocks', + 'Tackles', 'Take-on', 'Duels', 'Fouls', 'Mistakes', + 'Defensive Line Supports', 'Crosses Received', 'Assists', + 'Goals Conceded', 'Set Piece Defence', 'Saves', None, 'None' + ] + + possession_team_actions = ['Passes', 'Passes Received', 'Set Pieces', + 'Key Passes', 'Shots & Goals', 'Crosses', + 'Aerial Control', + 'Mistakes', + 'Crosses Received', 'Assists', + ] + + possession = [] + # Determine possession + for i in range(len(df)): + if i == 0: + possession.append(df["team_id"].iloc[i]) + else: + if df.action.iloc[i] not in all_cation: + if not pd.isna(df.action.iloc[i]): + print(f"Warning: action {df.action.iloc[i]} was not found in the all action list") + if df["team_id"].iloc[i] == df["team_id"].iloc[i - 1]: + possession.append(df["team_id"].iloc[i]) + else: + if df["action"].iloc[i] in possession_team_actions: + possession.append(df["team_id"].iloc[i]) + else: + possession.append(df["team_id"].iloc[i - 1]) + + df["possession_team"] = possession + + temp_home_score = 0 + temp_away_score = 0 + home_score = [] + away_score = [] + goal_diff = [] + goal= [] + for i in range(len(df)): + home_score.append(temp_home_score) + away_score.append(temp_away_score) + goal_diff.append(temp_home_score - temp_away_score) + #check if Goal but not GoalKick and goalagainstis in the str of df["action"].iloc[i] + if str(df["action"].iloc[i]) == "Shots & Goals" and df["success"].iloc[i] == 1: + if df["home_team"].iloc[i] == 1: + temp_home_score += 1 + else: + temp_away_score += 1 + goal.append(1) + else: + goal.append(0) + + df["home_score"] = home_score + df["away_score"] = away_score + df["goal_diff"] = goal_diff + df["goal"] = goal + + #consider the side of the 2 team + side_dict={} + period_list=df.period.unique() + period_list=period_list[(period_list==1) | (period_list==2) | (period_list==3) | (period_list==4) | (period_list==5)] + for period in period_list: + period_df=df[df["period"]==period] + home_team_df=period_df[period_df["home_team"]==1] + attack_direction= home_team_df.iloc[0]["attack_direction"] + if attack_direction == "LEFT": + side_dict[period]=1 + elif attack_direction == "RIGHT": + side_dict[period]=0 + else: + side_dict[period]=1 + + + #group the event into simpliefied actions + short_pass_actions=['Passes', 'Set Pieces', 'Key Passes',] + long_pass_actions=[] + high_pass_actions=[] + shot_actions=['Shots & Goals',] + carray_actions=[] + dribble_actions=[] + cross_actions=['Crosses',] + drop_actions=[ 'Passes Received', 'Turnovers', + 'Interceptions', + 'Aerial Control', 'Clearances', 'Recoveries', 'Step-in', 'Blocks', + 'Tackles', 'Take-on', 'Duels', 'Fouls', 'Mistakes', + 'Defensive Line Supports', 'Crosses Received', 'Assists', + 'Goals Conceded', 'Set Piece Defence', 'Saves', None, 'None' + ] + + action_list=[] + for i in range(len(df)): + if df["action"].iloc[i] in short_pass_actions: + distance_i = np.sqrt((df["x_unscaled"].iloc[i] - df["to_x_unscaled"].iloc[i])**2 +\ + (df["y_unscaled"].iloc[i] - df["to_y_unscaled"].iloc[i])**2) + if distance_i>=45: + action_list.append("long_pass") + else: + action_list.append("short_pass") + + elif df["action"].iloc[i] in high_pass_actions: + action_list.append("high_pass") + elif df["action"].iloc[i] in shot_actions: + action_list.append("shot") + elif df["action"].iloc[i] in carray_actions: + action_list.append("carry") + elif df["action"].iloc[i] in dribble_actions: + action_list.append("dribble") + elif df["action"].iloc[i] in cross_actions: + action_list.append("cross") + elif df["action"].iloc[i] in drop_actions or pd.isna(df["action"].iloc[i]): + action_list.append("drop") + else: + action= df["action"].iloc[i] + print(f"Warning: action {action} was not found in the action list, it will be dropped") + action_list.append("drop") + + df["action"]=action_list + #drop the drop actions + df=df[df["action"]!="drop"].reset_index(drop=True) + + #compute + #consider the + dist2gaol_list=[] + angle2goal_list=[] + for i in range(len(df)): + period=df["period"].iloc[i] + home_team=df["home_team"].iloc[i] + side = side_dict.get(period, 1) # default to 1 if period not found + if (side == 1 and home_team == 1) or (side == 0 and home_team == 0): + distance2goal = ((df["x_unscaled"].iloc[i] - 105) ** 2 + + (df["y_unscaled"].iloc[i] - 34) ** 2) ** 0.5 + angle = np.abs(np.arctan2(df["y_unscaled"].iloc[i] - 34, + df["x_unscaled"].iloc[i] - 105)) + elif (side == 1 and home_team == 0) or (side == 0 and home_team == 1): + distance2goal = ((df["x_unscaled"].iloc[i] - 0) ** 2 + + (df["y_unscaled"].iloc[i] - 34) ** 2) ** 0.5 + angle = np.abs(np.arctan2(df["y_unscaled"].iloc[i] - 34, + df["x_unscaled"].iloc[i] - 0)) + dist2gaol_list.append(distance2goal) + angle2goal_list.append(angle) + + df["dist2goal"] = dist2gaol_list + df["angle2goal"] = angle2goal_list + + #create the time related features (delta_T) + delta_t_list=[] + for i in range(len(df)): + if i==0: + delta_t_list.append(0) + else: + delta_t_list.append(df["seconds"].iloc[i]-df["seconds"].iloc[i-1]) + df["delta_T"]=delta_t_list + + #create the location related features (deltaX, deltaY, distance) + delta_x_list=[] + delta_y_list=[] + dist_list=[] + + for i in range(len(df)): + if i==0: + delta_x=0 + delta_y=0 + distance=0 + else: + delta_x=df["x_unscaled"].iloc[i]-df["x_unscaled"].iloc[i-1] + delta_y=df["y_unscaled"].iloc[i]-df["y_unscaled"].iloc[i-1] + distance = np.sqrt(delta_x**2+delta_y**2) + delta_x_list.append(delta_x) + delta_y_list.append(delta_y) + dist_list.append(distance) + df["deltaX"]=delta_x_list + df["deltaY"]=delta_y_list + df["distance"]=dist_list + + + #create the possession id, end of possession, end of period, end of game + poss_id_list = [] + poss_id = 0 + for i in range(len(df)): + if i == 0: + poss_id_list.append(poss_id) + else: + if df["possession_team"].iloc[i] == df["possession_team"].iloc[i - 1]: + poss_id_list.append(poss_id) + else: + poss_id += 1 + poss_id_list.append(poss_id) + poss_id+=1 + df["poss_id"] = poss_id_list + + new_df = [] + period_list=df.period.unique() + #drop unusual period like 0, 5, 6 + period_list=period_list[(period_list==1) | (period_list==2) | (period_list==3) | (period_list==4) | (period_list==5)] + for period in period_list: + period_df = df[df["period"] == period] + for poss_id in period_df.poss_id.unique(): + poss_df = period_df[period_df["poss_id"] == poss_id] + for i in range(len(poss_df)): + new_df.append(poss_df.iloc[i]) + last_row = poss_df.iloc[-1].copy() + last_row["action"] = "_" + #change the value of the features to 0 + last_row['goal']=0 + last_row["success"]=0 + last_row["deltaX"]=0 + last_row["deltaY"]=0 + last_row["distance"]=0 + last_row["dist2goal"]=0 + last_row["angle2goal"]=0.5 + last_row["delta_T"]=0 + new_df.append(last_row) + last_row = period_df.iloc[-1].copy() + #change the value of the features to 0 + last_row['goal']=0 + last_row["success"]=0 + last_row["deltaX"]=0 + last_row["deltaY"]=0 + last_row["distance"]=0 + last_row["dist2goal"]=0 + last_row["angle2goal"]=0.5 + last_row["delta_T"]=0 + if period == df.period.unique()[-1]: + last_row["action"] = "game_over" + new_df.append(last_row) + else: + last_row["action"] = "period_over" + new_df.append(last_row) + df = pd.concat(new_df, axis=1).T.reset_index(drop=True) + + # #create the seconds column + # seconds_list=[] + # for i in range(len(df)): + # if df["Period"].iloc[i]==1: + # seconds_list.append(df.Minute.iloc[i]*60+df.Second.iloc[i]) + # elif df["Period"].iloc[i]==2: + # seconds_list.append(df.Minute.iloc[i]*60+df.Second.iloc[i]+60*45) + # #if there is error in the period col, use the previous row period + + # df["seconds"]=seconds_list + + #reset the features value to 0 (angle2goal to 0.5)for beginning of each period + new_df=[] + period_list=df.period.unique() + #drop unusual period like 0, 5, 6 + period_list=period_list[(period_list==1) | (period_list==2) | (period_list==3) | (period_list==4) | (period_list==5)] + for period in period_list: + period_df=df[df["period"]==period].copy() + for i in range(len(period_df)): + if i==0: + first_row=period_df.iloc[i].copy() + first_row["deltaX"]=0 + first_row["deltaY"]=0 + first_row["distance"]=0 + first_row["dist2goal"]=0 + first_row["angle2goal"]=0.5 + first_row["delta_T"]=0 + new_df.append(first_row) + else: + new_df.append(period_df.iloc[i]) + df=pd.concat(new_df,axis=1).T.reset_index(drop=True) + + #convert seconds, distance, dist2goal, angle2goal, start_x, start_y into type float + df["seconds"]=df["seconds"].astype(float) + df["distance"]=df["distance"].astype(float) + df["dist2goal"]=df["dist2goal"].astype(float) + df["angle2goal"]=df["angle2goal"].astype(float) + df["start_x"]=df["x_unscaled"].astype(float) + df["start_y"]=df["y_unscaled"].astype(float) + + #round numerical columns to 4 decimal places (period, minute, second, X, Y) + df = df.round({"Period": 4, "Minute": 4, "Second": 4, "seconds": 4, "start_x": 4, "start_y": 4, "deltaX": 4, "deltaY": 4, "distance": 4, "dist2goal": 4, "angle2goal": 4}) + + df['team'] = df.team_id + df['Period'] = df.period + df['Minute'] = df.seconds // 60 + df['Second'] = df.seconds % 60 + + #reorder columns + # away_player_id_23 away_x_23 away_y_23 away_speed_23 away_name_23 away_nameEn_23 away_shirtNumber_23 away_position_23 + + tracking_col_home = df.columns[df.columns.str.startswith("home_")].tolist() + tracking_col_home = [col for col in tracking_col_home if col != "home_team"] + tracking_col_away = df.columns[df.columns.str.startswith("away_")].tolist() + + df = df[['poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', + 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', + 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal']+tracking_col_home+tracking_col_away] + + return df + if __name__ == '__main__': import pdb @@ -1891,9 +2212,14 @@ def UIED_bepro(data): # df_datastadium.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/preprocess_UIED.csv",index=False) - df_bepro_path=os.getcwd()+"/test/sports/event_data/data/soccertrack/load.csv" + # df_bepro_path=os.getcwd()+"/test/sports/event_data/data/soccertrack/load.csv" + # df_bepro=UIED_soccertrackv2(df_bepro_path) + # df_bepro.to_csv(os.getcwd()+"/test/sports/event_data/data/soccertrack/preprocess_UIED.csv",index=False) + + df_bepro_path=os.getcwd()+"/test/sports/event_data/data/bepro/load.csv" df_bepro=UIED_bepro(df_bepro_path) - df_bepro.to_csv(os.getcwd()+"/test/sports/event_data/data/soccertrack/preprocess_UIED.csv",index=False) + df_bepro.to_csv(os.getcwd()+"/test/sports/event_data/data/bepro/preprocess_UIED.csv",index=False) + print('-----------------end-----------------') # pdb.set_trace() diff --git a/pyproject.toml b/pyproject.toml index aa00627..e4bba31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "openstarlab_preprocessing" -version = "0.1.50" +version = "0.1.51" description = "openstarlab preprocessing package" readme = "README.md" requires-python = ">=3.8"