import glob
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Read every yearly play-by-play CSV matching the pattern below, tag each
# file's rows with the season year taken from its file name, and stack them
# into a single frame (`mvb_pbp`).
files = sorted(glob.glob('data/mvb_pbp_div1_*.csv'))
print('Found files:', files)
if not files:
    # pd.concat([]) fails with an opaque "No objects to concatenate";
    # stop early with a message that names the pattern instead.
    raise FileNotFoundError(
        "No play-by-play files matched 'data/mvb_pbp_div1_*.csv'"
    )
dfs = []
for f in files:
    tmp = pd.read_csv(f)
    basename = os.path.basename(f)
    # expect filenames like mvb_pbp_div1_2020.csv -> extract year
    year = basename.split('_')[-1].split('.')[0]
    try:
        year_value = int(year)
    except ValueError as err:
        # The glob also matches names whose last token is not a year
        # (e.g. mvb_pbp_div1_2020_clean.csv); say which file is the problem.
        raise ValueError(
            f"Cannot parse a year from file name {basename!r}"
        ) from err
    tmp['year'] = year_value
    dfs.append(tmp)
mvb_pbp = pd.concat(dfs, ignore_index=True)
# ensure year column is integer
mvb_pbp['year'] = mvb_pbp['year'].astype(int)
# Bare expression: displays the frame in a notebook cell, has no effect when
# the file is run as a plain script.
mvb_pbp
# Work on a copy so the raw concatenated play-by-play frame stays untouched.
df = mvb_pbp.copy()

# Keys identifying one match, and one set within a match.
match_cols = ["date", "away_team", "home_team"]
set_cols = match_cols + ["set"]

# Previous-row scores, taken WITHIN each match/set. A plain shift/diff over
# the whole frame compares the first row of a set with the last row of the
# previous set (or of an unrelated match), which yields bogus "score changed"
# flags, negative diffs and wrong previous scores at every boundary.
# dropna=False keeps rows whose key contains NaN grouped together instead of
# leaving them without a previous row.
# NOTE(review): the first row of each set gets a previous score of 0, i.e.
# this assumes every set starts at 0-0 — confirm against the data.
prev_scores = (
    df.groupby(set_cols, sort=False, dropna=False)[["away_score", "home_score"]]
    .shift(1)
    .fillna(0)
)
away_delta = df["away_score"] - prev_scores["away_score"]
home_delta = df["home_score"] - prev_scores["home_score"]

# A row ends a rally when either side's score differs from the previous row
# of the same set. Derived from the same deltas as away_diff/home_diff so the
# flag and the diffs can never disagree; a missing score counts as "no change".
df["score_changed"] = away_delta.fillna(0).ne(0) | home_delta.fillna(0).ne(0)

# previous scores and score diffs
df["away_prev"] = prev_scores["away_score"]
df["home_prev"] = prev_scores["home_score"]
df["away_diff"] = away_delta
df["home_diff"] = home_delta

# assign rally winner (team name) when a score increases; rows where neither
# score increased stay NA. If both scores increased on one row, the home
# assignment runs last and wins.
away_scored = df["away_diff"] > 0
home_scored = df["home_diff"] > 0
df["rally_winner"] = pd.NA
df.loc[away_scored, "rally_winner"] = df.loc[away_scored, "away_team"]
df.loc[home_scored, "rally_winner"] = df.loc[home_scored, "home_team"]

# assign rally id: running count of score changes within each match
df["rally_id"] = (
    df.groupby(match_cols)["score_changed"]
    .cumsum()
)
# Coerce `set` to a number (unparseable values become NaN) so it can be used
# as a grouping and sorting key.
df['set'] = pd.to_numeric(df['set'], errors='coerce')

# One row per match/set: the last play-by-play row of the group, whose scores
# are taken as the set's final score.
group_cols = ['date', 'away_team', 'home_team', 'set']
match_key = group_cols[:3]
final_rows = df.groupby(group_cols, dropna=False).tail(1).copy()

# 1 when that side finished the set ahead, else 0 (a tie credits neither side)
final_rows['away_set_win'] = final_rows['away_score'].gt(final_rows['home_score']).astype(int)
final_rows['home_set_win'] = final_rows['home_score'].gt(final_rows['away_score']).astype(int)

# Order sets within each match so the running totals below follow set order.
final_rows = final_rows.sort_values(group_cols)

# Running number of sets won, up to and including the current set.
for side in ('away', 'home'):
    final_rows[f'cum_{side}_sets'] = (
        final_rows.groupby(match_key)[f'{side}_set_win'].cumsum()
    )

# Sets won BEFORE the current set: the previous set's running total within
# the same match, 0 for the first set of a match.
for side in ('away', 'home'):
    final_rows[f'{side}_sets_before'] = (
        final_rows.groupby(match_key)[f'cum_{side}_sets']
        .shift(1)
        .fillna(0)
        .astype(int)
    )

# Per-set "before" counts, keyed by match/set, for merging back onto every row.
set_before_cols = final_rows[group_cols + ['away_sets_before', 'home_sets_before']]

# Final number of sets won by each side, one row per match.
sets_summary = (
    final_rows.groupby(match_key)
    .agg(
        away_sets=('away_set_win', 'sum'),
        home_sets=('home_set_win', 'sum'),
    )
    .reset_index()
)


def _match_winner(row):
    """Return 1 if the away side won more sets, 0 if the home side did, else NA."""
    if row['away_sets'] > row['home_sets']:
        return 1
    if row['home_sets'] > row['away_sets']:
        return 0
    return pd.NA


sets_summary['match_winner'] = sets_summary.apply(_match_winner, axis=1)

# Attach the per-set "before" counts, so rows in set `s` see sets won before that set.
df = df.merge(set_before_cols, on=group_cols, how='left')
# Attach the per-match set totals and the match winner to every row.
df = df.merge(sets_summary, on=match_key, how='left')
# Drop the cumulative helper columns if present; neither merged frame carries
# them, so this only matters if `df` already had columns with these names.
df = df.drop(columns=['cum_away_sets', 'cum_home_sets'], errors='ignore')
# Score margin going into the row: away's previous score minus home's.
df['point_diff'] = df['away_prev'].sub(df['home_prev'])

# Match identifier string of the form "<date>_<away_team>_<home_team>".
date_text = df["date"].astype(str)
df["match_id"] = (
    date_text
    .add("_")
    .add(df["away_team"])
    .add("_")
    .add(df["home_team"])
)
# drop rows where match_winner is missing to avoid downstream NaNs
df = df[df['match_winner'].notna()]

# rows corresponding to the end of a rally (point scored)
rallies = df[df["score_changed"]].copy()

# Rally-level features.
# points_played: total points on the board before this rally.
rallies['points_played'] = rallies['away_prev'] + rallies['home_prev']
# score_interaction: score margin scaled by how many points have been played.
rallies["score_interaction"] = rallies["point_diff"] * rallies["points_played"]
# final_set_diff: away minus home sets won over the whole match.
rallies["final_set_diff"] = rallies['away_sets'] - rallies['home_sets']
# set_diff: away minus home sets won before the current set.
rallies["set_diff"] = rallies["away_sets_before"] - rallies["home_sets_before"]

# Drop rallies whose set started with one side already three sets ahead.
# .copy() so the column assignment below writes to an independent frame
# rather than to a slice of the previous one.
rallies = rallies[rallies['set_diff'].abs() != 3].copy()

# Text label of the set score going into the current set, e.g. "2-1".
rallies["set_state"] = (
    rallies["away_sets_before"].astype(str)
    + "-"
    + rallies["home_sets_before"].astype(str)
)

# manually filter for now: remove the "1-3" and "3-1" set states.
# The previous expression, `(state != "1-3") | state != "3-1"`, parsed as
# `((state != "1-3") | state) != "3-1"` because `|` binds tighter than `!=`,
# and even fully parenthesised an OR of two "not equal" tests is always true,
# so it could never remove a row.
# NOTE(review): other states with a side already on 3 sets (e.g. "3-2") are
# not filtered here — confirm whether they can occur in the data.
rallies = rallies[~rallies['set_state'].isin(["1-3", "3-1"])].copy()