NCAA Win Probability Model

Download NCAA pbp data
Convert it into one row per rally state
Compute ELO ratings
Generate training dataframe
Fit the first baseline logistic model

Things to do:

Step 1: Building the training table

Steps:

* Read in all play-by-play data
* Convert the play-by-play data to rally-level and match-level data
* Create features (Elo, set differential, point differential)

import glob
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Load every season's play-by-play CSV and tag each row with its season year.
files = sorted(glob.glob('data/mvb_pbp_div1_*.csv'))
print('Found files:', files)


def _season_of(path):
    """Return the year embedded in a file name such as mvb_pbp_div1_2020.csv."""
    last_piece = os.path.basename(path).rsplit('_', 1)[-1]
    return int(last_piece.split('.', 1)[0])


# One frame per file, each carrying a constant `year` column.
dfs = [pd.read_csv(path).assign(year=_season_of(path)) for path in files]

# Stack all seasons into a single frame with a fresh 0..n-1 index.
mvb_pbp = pd.concat(dfs, ignore_index=True)
# Keep `year` as an integer column.
mvb_pbp['year'] = mvb_pbp['year'].astype(int)
mvb_pbp

df = mvb_pbp.copy()

# A match is identified by its date and the two team names.
match_cols = ["date", "away_team", "home_team"]
score_cols = ["away_score", "home_score"]

# Score on the previous row *of the same match and set*. Taking the previous
# row of the whole concatenated frame instead would compare the first row of a
# set with the final score of the preceding set (or preceding match), which
# flags a bogus score change and leaks that old score into away_prev/home_prev.
# The first row of each set has no predecessor and is compared against 0-0.
prev_scores = (
    df.groupby(match_cols + ["set"], dropna=False)[score_cols]
      .shift(1)
      .fillna(0)
)
away_step = df["away_score"] - prev_scores["away_score"]
home_step = df["home_score"] - prev_scores["home_score"]

# A row ends a rally when either side's score moved; rows whose score is
# missing are treated as "no change".
df["score_changed"] = away_step.fillna(0).ne(0) | home_step.fillna(0).ne(0)

# previous scores and per-row score deltas
df["away_prev"] = prev_scores["away_score"]
df["home_prev"] = prev_scores["home_score"]
df["away_diff"] = away_step
df["home_diff"] = home_step

# rally winner (team name) on rows where a side's score went up, else <NA>
away_scored = df["away_diff"] > 0
home_scored = df["home_diff"] > 0
df["rally_winner"] = pd.NA
df.loc[away_scored, "rally_winner"] = df.loc[away_scored, "away_team"]
df.loc[home_scored, "rally_winner"] = df.loc[home_scored, "home_team"]

# rally id: running count of score changes within each match
df["rally_id"] = (
    df.groupby(match_cols)["score_changed"]
      .cumsum()
)

# `set` must be numeric before it is used as a grouping and sorting key;
# values that cannot be parsed become NaN.
df['set'] = pd.to_numeric(df['set'], errors='coerce')

# The last row of every (match, set) group carries that set's final score.
group_cols = ['date', 'away_team', 'home_team', 'set']
final_rows = df.groupby(group_cols, dropna=False).tail(1).copy()

# 1/0 flag per side for who took the set (both are 0 when the scores are level).
final_rows['away_set_win'] = final_rows['away_score'].gt(final_rows['home_score']).astype(int)
final_rows['home_set_win'] = final_rows['home_score'].gt(final_rows['away_score']).astype(int)

# Order the sets inside each match so the running totals accumulate correctly.
final_rows = final_rows.sort_values(['date', 'away_team', 'home_team', 'set'])

# Running number of sets won, up to and including the current set.
for side in ('away', 'home'):
    per_match = final_rows.groupby(['date', 'away_team', 'home_team'])
    final_rows[f'cum_{side}_sets'] = per_match[f'{side}_set_win'].cumsum()

# Sets won BEFORE the current set: the running total shifted down by one set,
# with 0 for the first set of each match.
for side in ('away', 'home'):
    per_match = final_rows.groupby(['date', 'away_team', 'home_team'])
    earlier = per_match[f'cum_{side}_sets'].shift(1)
    final_rows[f'{side}_sets_before'] = earlier.fillna(0).astype(int)

# Per-set "before" counts, keyed so they can be merged back onto every row.
set_before_cols = final_rows[
    ['date', 'away_team', 'home_team', 'set', 'away_sets_before', 'home_sets_before']
]

# Final number of sets won by each side in every match.
sets_summary = (
    final_rows
    .groupby(['date', 'away_team', 'home_team'])
    .agg(
        away_sets=('away_set_win', 'sum'),
        home_sets=('home_set_win', 'sum'),
    )
    .reset_index()
)


def _away_won(row):
    """Return 1 if the away side won more sets, 0 if the home side did, else pd.NA."""
    if row['away_sets'] > row['home_sets']:
        return 1
    if row['home_sets'] > row['away_sets']:
        return 0
    return pd.NA


# Match winner from the away team's point of view.
sets_summary['match_winner'] = sets_summary.apply(_away_won, axis=1)



# Give every row the sets each side had already won when its set started.
df = pd.merge(
    df,
    set_before_cols,
    how='left',
    on=['date', 'away_team', 'home_team', 'set'],
)

# Give every row the match-level set totals and the match_winner label.
df = pd.merge(
    df,
    sets_summary,
    how='left',
    on=['date', 'away_team', 'home_team'],
)

# Remove the cumulative helper columns if they happen to be present.
df = df.drop(columns=['cum_away_sets', 'cum_home_sets'], errors='ignore')

# Score margin (away minus home) before the current row's point.
df['point_diff'] = df['away_prev'].sub(df['home_prev'])

# Text key for a match: "<date>_<away team>_<home team>".
date_text = df["date"].astype(str)
df["match_id"] = date_text + "_" + df["away_team"] + "_" + df["home_team"]

# Keep only rows whose match has a winner, so no missing labels flow downstream.
has_winner = df['match_winner'].notna()
df = df[has_winner]

# One row per rally: keep only the rows on which the score changed.
rallies = df[df["score_changed"]].copy()

# Extra state features, all measured before the current point was scored.
rallies['points_played'] = rallies['away_prev'] + rallies['home_prev']
rallies["score_interaction"] = rallies["point_diff"] * rallies["points_played"]
# Final set margin of the whole match (away minus home).
rallies["final_set_diff"] = rallies['away_sets'] - rallies['home_sets']
# Set margin when the current set started (away minus home).
rallies["set_diff"] = rallies["away_sets_before"] - rallies["home_sets_before"]

# Drop rows whose set margin is already +/-3 before the set starts.
rallies = rallies[rallies['set_diff'].abs() != 3].copy()

# Set score before the current set as text, e.g. "2-1" (away-home).
rallies["set_state"] = (
    rallies["away_sets_before"].astype(str)
    + "-"
    + rallies["home_sets_before"].astype(str)
)

# manually filter for now: drop rows in the "1-3" / "3-1" states.
# isin() is used on purpose: chaining `!=` comparisons with `|` needs
# parentheses around every comparison (`|` binds tighter than `!=`), and
# "neither of these states" is an AND condition, not an OR.
rallies = rallies[~rallies['set_state'].isin(["1-3", "3-1"])]

Found files: ['data/mvb_pbp_div1_2020.csv', 'data/mvb_pbp_div1_2021.csv', 'data/mvb_pbp_div1_2022.csv', 'data/mvb_pbp_div1_2023.csv', 'data/mvb_pbp_div1_2024.csv']

/var/folders/4y/477v8cl903l85qv80z_6fmfw0000gn/T/ipykernel_58166/3603697490.py:140: Pandas4Warning: 'or' operations between boolean dtype and str are deprecated and will raise in a future version. Explicitly cast the strings to a boolean dtype before operating instead.
  rallies = rallies[(rallies['set_state'] != "1-3") | rallies['set_state'] != "3-1"]

# pd.set_option('display.max_rows', None)
# sets_summary[sets_summary['match_winner'].isna()]

# df[(df['date'] == '02/04/2021') & (df['away_team'] == 'UCLA')]
# df[(df['date'] == '03/11/2021') & (df['away_team'] == 'Pepperdine')]

# 02/04/2021_UCLA_BYU has an error between sets 1 and 2 that looks odd; let's drop that match.

# final_rows[(final_rows['date'] == '02/04/2021') & (final_rows['away_team'] == 'UCLA')]
# sets_summary[
#     sets_summary["away_sets"] == sets_summary["home_sets"]
# ]

# Some other matches show a 0-3 set state, which shouldn't be possible.
# pd.set_option('display.max_rows', None)
# rallies[rallies['home_sets_before'] == 3]
# df[(df['match_id']== '02/16/2023_Ball St._Loyola Chicago') & (df['set'] == 1)]
# df[(df['match_id']== '02/16/2023_Ball St._Loyola Chicago') & (df['set'] == 1)]
# rallies[(rallies['match_id']== '02/16/2023_Ball St._Loyola Chicago') & (rallies['set'] == 1)]

Creating Elo

Basic elo model from ChatGPT.

from collections import defaultdict



# One row per match: every column listed here is constant within a match, so
# the last rally row of each group is enough.
matches = (rallies
          .groupby(['year', 'match_id', 'date', 'away_team', 'home_team', 'match_winner', 'away_sets', 'home_sets'])
          [['year', 'match_id', 'date', 'away_team', 'home_team', 'match_winner', 'away_sets', 'home_sets', 'final_set_diff']]
          .tail(1)
          .reset_index(drop=True))  # Cleans the index while keeping the columns

# Elo update step size and the rating every team starts each year with.
K = 20
INITIAL_ELO = 1500.0

# Elo is path dependent: each match must only see results of matches played
# before it. Order the matches chronologically inside each year; the sort is
# stable, so same-day matches keep their original relative order.
# NOTE(review): assumes `date` strings look like MM/DD/YYYY -- confirm. Dates
# that do not parse become NaT and are placed last within their year.
matches["_played_on"] = pd.to_datetime(
    matches["date"], format="%m/%d/%Y", errors="coerce"
)
matches = (matches
           .sort_values(["year", "_played_on"], kind="stable", na_position="last")
           .drop(columns="_played_on")
           .reset_index(drop=True))

# Pre-match ratings are collected in float arrays (index label == position
# after reset_index) so the resulting columns are float64 rather than object.
n_matches = len(matches)
away_elo_pre = np.full(n_matches, np.nan)
home_elo_pre = np.full(n_matches, np.nan)

# compute Elo separately for each year; reset team ratings to default at year start
year_elos = []
for yr, group in matches.groupby('year', sort=True):
    elo = defaultdict(lambda: INITIAL_ELO)
    match_rows = zip(
        group.index, group["away_team"], group["home_team"], group["match_winner"]
    )
    for pos, away, home, away_won in match_rows:
        away_elo = elo[away]
        home_elo = elo[home]

        # ratings as they stood BEFORE this match
        away_elo_pre[pos] = away_elo
        home_elo_pre[pos] = home_elo

        # expected probability away wins
        expected_away = 1 / (1 + 10 ** ((home_elo - away_elo) / 400))
        # match_winner is 1 when the away team won the match, 0 otherwise
        actual_away = float(away_won)

        # zero-sum update: whatever the away side gains, the home side loses
        delta = K * (actual_away - expected_away)
        elo[away] = away_elo + delta
        elo[home] = home_elo - delta

    # record final ratings for this year
    for team, rating in elo.items():
        year_elos.append({"year": yr, "team": team, "elo": rating})

matches["away_elo_pre"] = away_elo_pre
matches["home_elo_pre"] = home_elo_pre
matches["elo_diff"] = away_elo_pre - home_elo_pre

# create DataFrame of year-end elos and pick top 3 per year
year_elo_df = pd.DataFrame(year_elos)
top3_per_year = (
    year_elo_df.sort_values(['year','elo'], ascending=[True, False])
    .groupby('year')
    .head(3)
    .reset_index(drop=True)
 )
top3_per_year

matches[
    ["year",
        "away_team",
        "home_team",
        "away_elo_pre",
        "home_elo_pre",
        "elo_diff"
    ]
]

	year	away_team	home_team	away_elo_pre	home_elo_pre	elo_diff
0	2020	CSUN	UC Irvine	1500	1500	0
1	2020	UC Irvine	CSUN	1510.0	1490.0	20.0
2	2020	UC Santa Barbara	CSUN	1500	1500.575011	-0.575011
3	2020	CSUN	UC Santa Barbara	1490.558461	1510.01655	-19.458089
4	2020	UC San Diego	CSUN	1500	1481.117925	18.882075
...	...	...	...	...	...	...
3742	2024	LeMoyne-Owen	Morehouse	1393.13762	1403.608563	-10.470943
3743	2024	LeMoyne-Owen	Morehouse	1383.438906	1413.307276	-29.86837
3744	2024	LeMoyne-Owen	Morehouse	1374.296476	1422.449707	-48.153231
3745	2024	Talladega	Morehouse	1507.413001	1431.072552	76.340449
3746	2024	Morehouse	Life (GA)	1423.235116	1478.410381	-55.175265

3747 rows × 6 columns

# Attach each match's pre-match Elo gap to its rally rows (inner join on match_id)
elo_lookup = matches[['match_id', 'elo_diff']]
rallies = rallies.merge(elo_lookup, on='match_id')

Model

# rallies['point_diff'].drop_duplicates()
# rallies['away_sets_before'].drop_duplicates()
# rallies['home_sets_before'].drop_duplicates()
# rallies["match_winner"].value_counts(dropna = False)
# rallies["match_winner"].isna().sum()
# rallies[rallies["match_winner"].isna()]
# rallies["match_winner"].value_counts(dropna = False)
# rallies[["match_winner"]]

EDA

# Mean of match_winner and number of rally rows at every point differential
empirical = (
    rallies
    .groupby("point_diff", as_index=False)
    .agg(
        win_prob=("match_winner", "mean"),
        n=("match_winner", "size"),
    )
)

# Show the five most negative point differentials
empirical.sort_values("point_diff").head()

	point_diff	win_prob	n
0	-22.0	0.0	1
1	-21.0	0.0	1
2	-20.0	0.0	7
3	-19.0	0.090909	22
4	-18.0	0.035714	56

import matplotlib.pyplot as plt

# Only plot point differentials that were observed on more than 100 rally rows
plot_df = empirical.loc[empirical["n"].gt(100)]

x_vals = plot_df["point_diff"]
y_vals = plot_df["win_prob"]
plt.plot(x_vals, y_vals)

plt.xlabel("Point Differential")
plt.ylabel("Away Match Win Probability")
plt.show()

# Mean of match_winner for each set differential at the start of the set
rallies["match_winner"].groupby(rallies["set_diff"]).mean()

set_diff
-2    0.032427
-1    0.152919
 0    0.404267
 1    0.757662
 2    0.945289
Name: match_winner, dtype: object

Train on 2020-2023 and test on 2024

Baseline LR Model

# import statsmodels.api as sm

# Train on seasons up to 2023 and hold out 2024; the "1-3" / "3-1" set states
# are excluded from both splits.
train = rallies[rallies["year"] <= 2023].copy()
train = train[(train['set_state'] != "1-3") & (train['set_state'] != "3-1")]

test = rallies[rallies["year"] == 2024].copy()
test = test[(test['set_state'] != "1-3") & (test['set_state'] != "3-1")]

# Baseline inputs. points_played, score_interaction, away_prev/home_prev and
# away_sets_before/home_sets_before are available but not used here.
features = [
    "point_diff",
    'set_diff',
    "elo_diff",
]
baseline_model = LogisticRegression(
    max_iter=1000
)

# Binary target: 1 when the away team won the match.
y_train = train["match_winner"].astype(int)
baseline_fit = baseline_model.fit(train[features], y_train)

# View the pieces manually
print("Intercept:", baseline_model.intercept_)
print("Coefficients:", baseline_model.coef_)
# LogisticRegression.score() is the mean classification accuracy on the data
# it is given -- it is not an R-squared -- so label it as such.
print("Training accuracy:", baseline_model.score(train[features], y_train))

# Combine into a clean DataFrame
summary_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": baseline_model.coef_[0]
})

summary_df

Intercept: [-0.31811274]
Coefficients: [[0.25666261 1.40366086 0.00993638]]
R-Squared Score: 0.8063610296298738

	Feature	Coefficient
0	point_diff	0.256663
1	set_diff	1.403661
2	elo_diff	0.009936

# Testing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss

# predict_proba gives one column per class; keep the second column as a 1-D
# array of scores for the held-out rows.
y_true = test["match_winner"].astype(int)
y_probs = baseline_model.predict_proba(test[features])[:, 1]

# Area under the ROC curve
score = roc_auc_score(y_true, y_probs)
print("ROC:", score)

# Log loss
ll = log_loss(y_true, y_probs)
print("Log Loss:", ll)

# Brier score
bs = brier_score_loss(y_true, y_probs)
print("Brier Score:", bs)

ROC: 0.8935547300861888
Log Loss: 0.3978491196383412
Brier Score: 0.1278028090713494

Fitting Models Per Set State

# Same split as the baseline: seasons up to 2023 for training, 2024 held out,
# with the "1-3" / "3-1" set states removed from both.
train = rallies.loc[rallies["year"] <= 2023].copy()
train = train.loc[~train['set_state'].isin(["1-3", "3-1"])]

test = rallies.loc[rallies["year"] == 2024].copy()
test = test.loc[~test['set_state'].isin(["1-3", "3-1"])]

# set_diff is left out: it is constant within a set state.
features = ["point_diff", "points_played", "score_interaction", "elo_diff"]

# One logistic regression per set state, keyed by the state string.
state_models = {}
for state, subset in train.groupby("set_state"):
    model = LogisticRegression(max_iter=1000)

    # Show the class balance of this state before fitting.
    print(state, subset["match_winner"].value_counts())

    target = subset["match_winner"].astype(int)
    model.fit(subset[features], target)
    state_models[state] = model

0-0 match_winner
0    78348
1    48667
Name: count, dtype: int64
0-1 match_winner
0    65652
1    11266
Name: count, dtype: int64
0-2 match_winner
0    53693
1     1702
Name: count, dtype: int64
1-0 match_winner
1    37175
0    13103
Name: count, dtype: int64
1-1 match_winner
0    22759
1    17894
Name: count, dtype: int64
1-2 match_winner
0    28005
1     5649
Name: count, dtype: int64
2-0 match_winner
1    29207
0     1817
Name: count, dtype: int64
2-1 match_winner
1    20462
0     5737
Name: count, dtype: int64
2-2 match_winner
0    7231
1    6748
Name: count, dtype: int64

Point differential per set state

# Display the first model stored in state_models (the first one inserted by the fitting loop).
next(iter(state_models.values()))

LogisticRegression(max_iter=1000)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

LogisticRegression

?Documentation for LogisticRegressioniFitted

Parameters

	max_iter max_iter: int, default=100 Maximum number of iterations taken for the solvers to converge.	1000
	penalty penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add an L2 penalty term and it is the default choice; - `'l1'`: add an L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning:: Some penalties may not work with some solvers. See the parameter `solver` below, to know the compatibility between the penalty and solver. .. versionadded:: 0.19 l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8 `penalty` was deprecated in version 1.8 and will be removed in 1.10. Use `l1_ratio` and `C` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for `penalty='l1'`, `l1_ratio` set to any float between 0 and 1 for `penalty='elasticnet'`, and `C=np.inf` for `penalty=None`.	'deprecated'
	C C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.	1.0
	l1_ratio l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning:: Certain values of `l1_ratio`, i.e. some penalties, may not work with some solvers. See the parameter `solver` below, to know the compatibility between the penalty and solver. .. versionchanged:: 1.8 Default value changed from None to 0.0. .. deprecated:: 1.8 `None` is deprecated and will be removed in version 1.10. Always use `l1_ratio` to specify the penalty type.	0.0
	dual dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation <regularized-logistic-loss>`) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.	False
	tol tol: float, default=1e-4 Tolerance for stopping criteria.	0.0001
	fit_intercept fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.	True
	intercept_scaling intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a "synthetic" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note:: The synthetic feature weight is subject to L1 or L2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) `intercept_scaling` has to be increased.	1
	class_weight class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17 class_weight='balanced'	None
	random_state random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary <random_state>` for details.	None
	solver solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except 'liblinear' minimize the full multinomial loss, 'liblinear' will raise an error. - 'newton-cholesky' is a good choice for `n_samples` >> `n_features * n_classes`, especially with one-hot encoded categorical features with rare categories. Be aware that the memory usage of this solver has a quadratic dependency on `n_features * n_classes` because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag' and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a one-versus-rest scheme for the multiclass setting one can wrap it with the :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning:: The choice of the algorithm depends on the penalty chosen (`l1_ratio=0` for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for Elastic-Net) and on (multinomial) multiclass support: ================= ======================== ====================== solver l1_ratio multinomial multiclass ================= ======================== ====================== 'lbfgs' l1_ratio=0 yes 'liblinear' l1_ratio=1 or l1_ratio=0 no 'newton-cg' l1_ratio=0 yes 'newton-cholesky' l1_ratio=0 yes 'sag' l1_ratio=0 yes 'saga' 0<=l1_ratio<=1 yes ================= ======================== ====================== .. note:: 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from :mod:`sklearn.preprocessing`. .. 
seealso:: Refer to the :ref:`User Guide <Logistic_regression>` for more information regarding :class:`LogisticRegression` and more specifically the :ref:`Table <logistic_regression_solvers>` summarizing solver/penalty supports. .. versionadded:: 0.17 Stochastic Average Gradient (SAG) descent solver. Multinomial support in version 0.18. .. versionadded:: 0.19 SAGA solver. .. versionchanged:: 0.22 The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2 newton-cholesky solver. Multinomial support in version 1.6.	'lbfgs'
	verbose verbose: int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity.	0
	warm_start warm_start: bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. See :term:`the Glossary <warm_start>`. .. versionadded:: 0.17 warm_start to support lbfgs, newton-cg, sag, saga solvers.	False
	n_jobs n_jobs: int, default=None Does not have any effect. .. deprecated:: 1.8 `n_jobs` is deprecated in version 1.8 and will be removed in 1.10.	None

Fitted attributes

Name	Type	Value
classes_ classes_: ndarray of shape (n_classes, ) A list of class labels known to the classifier.	ndarray[int64](2,)	[0,1]
coef_ coef_: ndarray or CSR matrix of shape (1, n_features) or (n_classes, n_features) Coefficients of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem is binary. By default, it will be created as a dense array, but can be turned to sparse (CSR format) through :meth:`sparsify` (which can be beneficial under L1 regularization when many coefficients are zero), and back to dense through :meth:`densify`.	ndarray[float64](1, 4)	[[ 0.28, 0. ,-0. , 0.01]]
feature_names_in_ feature_names_in_: ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0	ndarray[object](4,)	['point_diff','points_played','score_interaction','elo_diff']
intercept_ intercept_: ndarray of shape (1,) or (n_classes,) Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero. `intercept_` is of shape (1,) when the given problem is binary.	ndarray[float64](1,)	[-0.52]
n_features_in_ n_features_in_: int Number of features seen during :term:`fit`. .. versionadded:: 0.24	int	4
n_iter_ n_iter_: ndarray of shape (1, ) Actual number of iterations for all classes. .. versionchanged:: 0.20 In SciPy <= 1.0.0 the number of lbfgs iterations may exceed ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.	ndarray[int32](1,)	[38]

# Output column names for the four fitted coefficients, in the order they are
# read from model.coef_[0].
coef_columns = ["point_diff", "points_played", "score_interaction", "elo_coef"]


def _coef_row(state, model):
    """Return one table row: the state, its intercept and its four coefficients."""
    row = {"state": state, "intercept": model.intercept_[0]}
    for position, column in enumerate(coef_columns):
        row[column] = model.coef_[0][position]
    return row


coef_df = pd.DataFrame(
    [_coef_row(state, model) for state, model in state_models.items()]
)

# Display order for the set states.
state_order = ["0-0", "1-0", "0-1", "1-1", "2-0", "0-2", "2-1", "1-2", "2-2"]

# An ordered categorical makes sort_values follow state_order instead of
# alphabetical order.
coef_df["state"] = pd.Categorical(
    coef_df["state"], categories=state_order, ordered=True
)
coef_df = coef_df.sort_values("state")

import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))

# Coefficient columns of coef_df to draw, one line each. Other available
# columns: "point_diff", "score_interaction".
# Every entry needs its own comma: adjacent string literals are silently
# concatenated into a single (non-existent) column name.
plotted_coefs = [
    "points_played",
    "elo_coef",
]

for col in plotted_coefs:
    plt.plot(
        coef_df["state"],
        coef_df[col],
        marker="o",
        label=col
    )

# Reference line at zero to make the sign of each coefficient easy to read.
plt.axhline(
    0,
    linestyle="--",
    alpha=.5
)

plt.xlabel("Set State")
plt.ylabel("Coefficient")
plt.title("Logistic Regression Coefficients by Set State")

plt.legend()
plt.grid(alpha=.3)

plt.show()

Testing Results

# Start with no prediction; each per-state model fills in its own rows.
test["pred"] = np.nan

for b, model in state_models.items():
    idx = test["set_state"].eq(b)

    # Nothing to score if the held-out data never reaches this state.
    if not idx.any():
        continue

    state_probs = model.predict_proba(test.loc[idx, features])[:, 1]
    test.loc[idx, "pred"] = state_probs

# Rows whose state has no fitted model keep NaN and are left out of evaluation.
eval_df = test.dropna(subset=["pred"]).copy()

from sklearn.metrics import log_loss

y_eval = eval_df["match_winner"].astype(int)

ll = log_loss(y_eval, eval_df["pred"])
print("Log Loss:", ll)

from sklearn.metrics import brier_score_loss

bs = brier_score_loss(y_eval, eval_df["pred"])
print("Brier Score:", bs)

Log Loss: 0.3935792494201294
Brier Score: 0.12619771892591225

# Work on a copy holding just the prediction and the outcome.
cal = eval_df.loc[:, ["pred", "match_winner"]].copy()

# Split predictions into up to ten equal-count bins (duplicate edges are dropped).
cal["bin"] = pd.qcut(cal["pred"], q=10, duplicates="drop")

# Per bin: mean prediction, mean outcome, and row count.
calibration_spec = {
    "predicted": ("pred", "mean"),
    "observed": ("match_winner", "mean"),
    "n": ("match_winner", "size"),
}
calibration = (
    cal
    .groupby("bin", observed=True)
    .agg(**calibration_spec)
    .reset_index()
)

print(calibration)

                    bin  predicted  observed      n
0  (-0.0009671, 0.0187]   0.008667  0.009095  14733
1      (0.0187, 0.0541]   0.034486    0.0412  14733
2       (0.0541, 0.111]   0.081313  0.087293  14732
3         (0.111, 0.19]   0.148010  0.153058  14733
4         (0.19, 0.298]   0.241249  0.239343  14732
5        (0.298, 0.438]   0.364524  0.382882  14733
6        (0.438, 0.602]   0.517240  0.517241  14732
7        (0.602, 0.775]   0.688706  0.696735  14733
8        (0.775, 0.911]   0.845219  0.876663  14732
9          (0.911, 1.0]   0.960752  0.974004  14733

import matplotlib.pyplot as plt

plt.figure(figsize=(6,6))

# Binned calibration curve: mean prediction against mean outcome.
mean_predicted = calibration["predicted"]
mean_observed = calibration["observed"]
plt.plot(mean_predicted, mean_observed, marker="o")

# Dashed diagonal from (0, 0) to (1, 1) for reference.
diagonal = [0,1]
plt.plot(diagonal, diagonal, "--")

plt.xlabel("Predicted Probability")
plt.ylabel("Observed Win Rate")
plt.title("2024 Calibration")

plt.show()

from sklearn.metrics import roc_auc_score

# ROC AUC of the per-state predictions on the evaluated rows
eval_labels = eval_df["match_winner"].astype(int)
auc = roc_auc_score(eval_labels, eval_df["pred"])

print("AUC:", auc)

AUC: 0.8961475556717216