--- title: "Predicting NCAA Men's Volleyball In-game Win Probability version 3" # description: "A short summary of what this post covers." author: "Kenny Chen" date: "2026-06-18" categories: [Statistics] # image: thumbnail.jpg --- NCAA Win Probability Model

NCAA Win Probability Model

  1. Download NCAA pbp data
  2. Convert it into one row per rally state
  3. Compute ELO ratings
  4. Generate training dataframe
  5. Fit the first baseline logistic model

Things to do:

Step 1: Building the training table

Steps: * Read in all play by play data * Convert pbp to rally data and match data * Create features (elo, set differential, point differential)

import glob
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# read all yearly pbp csvs and add a year column
files = sorted(glob.glob('data/mvb_pbp_div1_*.csv'))
print('Found files:', files)
dfs = []
for f in files:
    tmp = pd.read_csv(f)
    basename = os.path.basename(f)
    # expect filenames like mvb_pbp_div1_2020.csv -> extract year
    year = basename.split('_')[-1].split('.')[0]
    tmp['year'] = int(year)
    dfs.append(tmp)
mvb_pbp = pd.concat(dfs, ignore_index=True)
# ensure year column is integer
mvb_pbp['year'] = mvb_pbp['year'].astype(int)
mvb_pbp

df = mvb_pbp.copy()

df["score_changed"] = (
    df["away_score"].diff().fillna(0).ne(0)
    |
    df["home_score"].diff().fillna(0).ne(0)
)

# compute previous scores and score diffs
df[['away_prev', 'home_prev']] = df[['away_score', 'home_score']].shift(1).fillna(0)
df['away_diff'] = df['away_score'] - df['away_prev']
df['home_diff'] = df['home_score'] - df['home_prev']

# assign rally winner (team name) when a score increases
df['rally_winner'] = pd.NA
df.loc[df['away_diff'] > 0, 'rally_winner'] = df.loc[df['away_diff'] > 0, 'away_team']
df.loc[df['home_diff'] > 0, 'rally_winner'] = df.loc[df['home_diff'] > 0, 'home_team']

# assign rally id
match_cols = ["date", "away_team", "home_team"]

df["rally_id"] = (
    df.groupby(match_cols)["score_changed"]
      .cumsum()
)

# Ensure `set` is numeric so we can group by it
df['set'] = pd.to_numeric(df['set'], errors='coerce')

# Determine the final row for each match/set so we know the set score
group_cols = ['date', 'away_team', 'home_team', 'set']
final_rows = df.groupby(group_cols, dropna=False).tail(1).copy()




# mark which team won each set
final_rows['away_set_win'] = (final_rows['away_score'] > final_rows['home_score']).astype(int)
final_rows['home_set_win'] = (final_rows['home_score'] > final_rows['away_score']).astype(int)

# sort by set so cumulative sums are correct
final_rows = final_rows.sort_values(['date', 'away_team', 'home_team', 'set'])

# cumulative sets won up to and including this set
final_rows['cum_away_sets'] = final_rows.groupby(['date', 'away_team', 'home_team'])['away_set_win'].cumsum()
final_rows['cum_home_sets'] = final_rows.groupby(['date', 'away_team', 'home_team'])['home_set_win'].cumsum()

# sets won BEFORE this set (shift cumulative sums by 1)
final_rows['away_sets_before'] = final_rows.groupby(['date', 'away_team', 'home_team'])['cum_away_sets'].shift(1).fillna(0).astype(int)
final_rows['home_sets_before'] = final_rows.groupby(['date', 'away_team', 'home_team'])['cum_home_sets'].shift(1).fillna(0).astype(int)

# prepare per-set before-values to merge back into full dataframe
set_before_cols = final_rows[['date', 'away_team', 'home_team', 'set', 'away_sets_before', 'home_sets_before']]

# aggregate sets won per match (final totals)
sets_summary = final_rows.groupby(['date', 'away_team', 'home_team']).agg(
    away_sets_total=('away_set_win', 'sum'),
    home_sets_total=('home_set_win', 'sum')
).reset_index()
sets_summary = sets_summary.rename(columns={'away_sets_total': 'away_sets', 'home_sets_total': 'home_sets'})

# determine match winner (team with more sets)
sets_summary['match_winner'] = sets_summary.apply(
    lambda r: 1 if r['away_sets'] > r['home_sets'] else (0 if r['home_sets'] > r['away_sets'] else pd.NA),
    axis=1
)



# merge per-set "before" counts into both dataframes so rows in set `s` see sets won before that set
df = df.merge(set_before_cols, on=['date', 'away_team', 'home_team', 'set'], how='left')

# merge match totals and match_winner
df = df.merge(sets_summary, on=['date', 'away_team', 'home_team'], how='left')

# drop helper columns
df.drop(columns=['cum_away_sets', 'cum_home_sets'], inplace=True, errors='ignore')


df['point_diff'] = df['away_prev'] - df['home_prev']

df["match_id"] = (
    df["date"].astype(str)
    + "_"
    + df["away_team"]
    + "_"
    + df["home_team"]
)


# rows corresponding to the end of a rally (point scored)
# drop rows where match_winner is missing to avoid downstream NaNs
df = df[df['match_winner'].notna()]

rallies = df[df["score_changed"]].copy()


# creating some more variables
rallies['points_played'] = rallies['away_prev'] + rallies['home_prev']
rallies["score_interaction"] = (
    rallies["point_diff"]
    * rallies["points_played"]
)
rallies["final_set_diff"] = rallies['away_sets'] - rallies['home_sets']
rallies["set_diff"] = (
    rallies["away_sets_before"]
    - rallies["home_sets_before"]
)

rallies = rallies[abs(rallies['set_diff']) != 3]

rallies["set_state"] = (
    rallies["away_sets_before"].astype(str)
    + "-"
    + rallies["home_sets_before"].astype(str)
)
# manually filter for now
rallies = rallies[(rallies['set_state'] != "1-3") | rallies['set_state'] != "3-1"]
Found files: ['data/mvb_pbp_div1_2020.csv', 'data/mvb_pbp_div1_2021.csv', 'data/mvb_pbp_div1_2022.csv', 'data/mvb_pbp_div1_2023.csv', 'data/mvb_pbp_div1_2024.csv']
/var/folders/4y/477v8cl903l85qv80z_6fmfw0000gn/T/ipykernel_58166/3603697490.py:140: Pandas4Warning: 'or' operations between boolean dtype and str are deprecated and will raise in a future version. Explicitly cast the strings to a boolean dtype before operating instead.
  rallies = rallies[(rallies['set_state'] != "1-3") | rallies['set_state'] != "3-1"]
# pd.set_option('display.max_rows', None)
# sets_summary[sets_summary['match_winner'].isna()]

# df[(df['date'] == '02/04/2021') & (df['away_team'] == 'UCLA')]
# df[(df['date'] == '03/11/2021') & (df['away_team'] == 'Pepperdine')]

# 02/04/2021_UCLA_BYU has an error in between sets 1 and sets 2 kinda weird lets drop it

# final_rows[(final_rows['date'] == '02/04/2021') & (final_rows['away_team'] == 'UCLA')]
# sets_summary[
#     sets_summary["away_sets"] == sets_summary["home_sets"]
# ]

# some others are displaying 0-3 which shouldnt be possible
# pd.set_option('display.max_rows', None)
# rallies[rallies['home_sets_before'] == 3]
# df[(df['match_id']== '02/16/2023_Ball St._Loyola Chicago') & (df['set'] == 1)]
# df[(df['match_id']== '02/16/2023_Ball St._Loyola Chicago') & (df['set'] == 1)]
# rallies[(rallies['match_id']== '02/16/2023_Ball St._Loyola Chicago') & (rallies['set'] == 1)]

Creating Elo

Basic elo model from ChatGPT.

from collections import defaultdict



matches = (rallies
          .groupby(['year', 'match_id', 'date', 'away_team', 'home_team', 'match_winner', 'away_sets', 'home_sets'])
          [['year', 'match_id', 'date', 'away_team', 'home_team', 'match_winner', 'away_sets', 'home_sets', 'final_set_diff']]
          .tail(1)
          .reset_index(drop=True))  # Cleans the index while keeping the columns

# prepare columns
matches["away_elo_pre"] = None
matches["home_elo_pre"] = None
matches["elo_diff"] = None

K = 20

# compute Elo separately for each year; reset team ratings to default at year start
year_elos = []
for yr, group in matches.groupby('year', sort=True):
    elo = defaultdict(lambda: 1500)
    for idx, row in group.iterrows():
        away = row["away_team"]
        home = row["home_team"]

        away_elo = elo[away]
        home_elo = elo[home]

        matches.loc[idx, "away_elo_pre"] = away_elo
        matches.loc[idx, "home_elo_pre"] = home_elo
        matches.loc[idx, "elo_diff"] = away_elo - home_elo

        # expected probability away wins
        expected_away = 1 / (1 + 10 ** ((home_elo - away_elo) / 400))
        expected_home = 1 - expected_away

        actual_away = row["match_winner"]
        actual_home = 1 - actual_away

        elo[away] = away_elo + K * (actual_away - expected_away)
        elo[home] = home_elo + K * (actual_home - expected_home)

    # record final ratings for this year
    for team, rating in elo.items():
        year_elos.append({"year": yr, "team": team, "elo": rating})

# create DataFrame of year-end elos and pick top 3 per year
year_elo_df = pd.DataFrame(year_elos)
top3_per_year = (
    year_elo_df.sort_values(['year','elo'], ascending=[True, False])
    .groupby('year')
    .head(3)
    .reset_index(drop=True)
 )
top3_per_year

matches[
    ["year",
        "away_team",
        "home_team",
        "away_elo_pre",
        "home_elo_pre",
        "elo_diff"
    ]
]
year away_team home_team away_elo_pre home_elo_pre elo_diff
0 2020 CSUN UC Irvine 1500 1500 0
1 2020 UC Irvine CSUN 1510.0 1490.0 20.0
2 2020 UC Santa Barbara CSUN 1500 1500.575011 -0.575011
3 2020 CSUN UC Santa Barbara 1490.558461 1510.01655 -19.458089
4 2020 UC San Diego CSUN 1500 1481.117925 18.882075
... ... ... ... ... ... ...
3742 2024 LeMoyne-Owen Morehouse 1393.13762 1403.608563 -10.470943
3743 2024 LeMoyne-Owen Morehouse 1383.438906 1413.307276 -29.86837
3744 2024 LeMoyne-Owen Morehouse 1374.296476 1422.449707 -48.153231
3745 2024 Talladega Morehouse 1507.413001 1431.072552 76.340449
3746 2024 Morehouse Life (GA) 1423.235116 1478.410381 -55.175265

3747 rows × 6 columns

# Merge to rallies
rallies = rallies.merge(matches[['match_id', 'elo_diff']], on = 'match_id')

Model

# rallies['point_diff'].drop_duplicates()
# rallies['away_sets_before'].drop_duplicates()
# rallies['home_sets_before'].drop_duplicates()
# rallies["match_winner"].value_counts(dropna = False)
# rallies["match_winner"].isna().sum()
# rallies[rallies["match_winner"].isna()]
# rallies["match_winner"].value_counts(dropna = False)
# rallies[["match_winner"]]

EDA

empirical = (
    rallies
    .groupby("point_diff")
    .agg(
        win_prob=("match_winner","mean"),
        n=("match_winner","size")
    )
    .reset_index()
)

empirical.sort_values("point_diff").head()
point_diff win_prob n
0 -22.0 0.0 1
1 -21.0 0.0 1
2 -20.0 0.0 7
3 -19.0 0.090909 22
4 -18.0 0.035714 56
import matplotlib.pyplot as plt

plot_df = empirical[empirical["n"] > 100]

plt.plot(
    plot_df["point_diff"],
    plot_df["win_prob"]
)

plt.xlabel("Point Differential")
plt.ylabel("Away Match Win Probability")
plt.show()

rallies.groupby("set_diff")["match_winner"].mean()
set_diff
-2    0.032427
-1    0.152919
 0    0.404267
 1    0.757662
 2    0.945289
Name: match_winner, dtype: object

Train on 2020-2023 and test on 2024

Baseline LR Model

# import statsmodels.api as sm

train = rallies[rallies["year"] <= 2023].copy()
train = train[(train['set_state'] != "1-3") & (train['set_state'] != "3-1")]

test = rallies[rallies["year"] == 2024].copy()
test = test[(test['set_state'] != "1-3") & (test['set_state'] != "3-1")]



features = [
     "point_diff",
    # "points_played",
    # "score_interaction",
    'set_diff',
    "elo_diff"
    # 'away_prev',
    # 'home_prev',
    # 'away_sets_before', 
    # 'home_sets_before',
    # "point_diff",
    # "set_diff",
]
baseline_model = LogisticRegression(
    max_iter=1000
)

baseline_fit = baseline_model.fit(
        train[features],
        train["match_winner"].astype(int)
    )
# View the pieces manually

print("Intercept:", baseline_model.intercept_)
print("Coefficients:", baseline_model.coef_)
print("R-Squared Score:", baseline_model.score(
    train[features],
    train["match_winner"].astype(int)
))

# Combine into a clean DataFrame
summary_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": baseline_model.coef_[0]
})

summary_df
Intercept: [-0.31811274]
Coefficients: [[0.25666261 1.40366086 0.00993638]]
R-Squared Score: 0.8063610296298738
Feature Coefficient
0 point_diff 0.256663
1 set_diff 1.403661
2 elo_diff 0.009936
# Testing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss

# Use [:, 1] to select all rows, but only the second column
y_probs = baseline_model.predict_proba(test[features])[:, 1]

# Now y_probs is a 1D array with shape (147326,)
score = roc_auc_score(test["match_winner"].astype(int), y_probs) 


print("ROC:", score)




ll = log_loss(
test["match_winner"].astype(int), y_probs
)

print("Log Loss:", ll)


bs = brier_score_loss(
test["match_winner"].astype(int), y_probs
)

print("Brier Score:", bs)
ROC: 0.8935547300861888
Log Loss: 0.3978491196383412
Brier Score: 0.1278028090713494

Fitting Models Per Set State

train = rallies[rallies["year"] <= 2023].copy()
train = train[(train['set_state'] != "1-3") & (train['set_state'] != "3-1")]

test = rallies[rallies["year"] == 2024].copy()
test = test[(test['set_state'] != "1-3") & (test['set_state'] != "3-1")]



features = [
     "point_diff",
    "points_played",
    "score_interaction",
    "elo_diff"
    # 'away_prev',
    # 'home_prev',
    # 'away_sets_before', 
    # 'home_sets_before',
    # "point_diff",
    # "set_diff",
]

state_models = {}

for state, subset in train.groupby("set_state"):

    # if len(subset) < 1000:
    #     continue

    model = LogisticRegression(
        max_iter=1000
    )
    print(state, subset["match_winner"].value_counts())

    model.fit(
        subset[features],
        subset["match_winner"].astype(int)
    )

    state_models[state] = model
0-0 match_winner
0    78348
1    48667
Name: count, dtype: int64
0-1 match_winner
0    65652
1    11266
Name: count, dtype: int64
0-2 match_winner
0    53693
1     1702
Name: count, dtype: int64
1-0 match_winner
1    37175
0    13103
Name: count, dtype: int64
1-1 match_winner
0    22759
1    17894
Name: count, dtype: int64
1-2 match_winner
0    28005
1     5649
Name: count, dtype: int64
2-0 match_winner
1    29207
0     1817
Name: count, dtype: int64
2-1 match_winner
1    20462
0     5737
Name: count, dtype: int64
2-2 match_winner
0    7231
1    6748
Name: count, dtype: int64

Point differential per set state

next(iter(state_models.values()))
LogisticRegression(max_iter=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
coef_df = pd.DataFrame([
    {
        "state": state,
        "intercept": model.intercept_[0],
        # "away_prev": model.coef_[0][0],
        # "home_prev": model.coef_[0][1],
        "point_diff": model.coef_[0][0],
        "points_played": model.coef_[0][1],
        "score_interaction": model.coef_[0][2],
        # "set_diff": model.coef_[0][3],
        "elo_coef": model.coef_[0][3]
    }
    for state, model in state_models.items()
])


state_order = [
    "0-0",
    "1-0",
    "0-1",
    "1-1",
    "2-0",
    "0-2",
    "2-1",
    "1-2",
    "2-2"
]

coef_df["state"] = pd.Categorical(
    coef_df["state"],
    categories=state_order,
    ordered=True
)

coef_df = coef_df.sort_values("state")

import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))

for col in [
    # "point_diff",
      "points_played",
        # "score_interaction"
          "elo_coef"
        ]:
# ["away_prev", "home_prev", "elo_coef"]:

    plt.plot(
        coef_df["state"],
        coef_df[col],
        marker="o",
        label=col
    )

plt.axhline(
    0,
    linestyle="--",
    alpha=.5
)

plt.xlabel("Set State")
plt.ylabel("Coefficient")
plt.title("Logistic Regression Coefficients by Set State")

plt.legend()
plt.grid(alpha=.3)

plt.show()

Testing Results

test["pred"] = np.nan

for b, model in state_models.items():

    idx = test["set_state"] == b

    if idx.sum() == 0:
        continue

    test.loc[idx, "pred"] = (
        model.predict_proba(
            test.loc[idx, features]
        )[:,1]
    )


eval_df = test.dropna(subset=["pred"]).copy()
from sklearn.metrics import log_loss

ll = log_loss(
    eval_df["match_winner"].astype(int),
    eval_df["pred"]
)

print("Log Loss:", ll)

from sklearn.metrics import brier_score_loss

bs = brier_score_loss(
    eval_df["match_winner"].astype(int),
    eval_df["pred"]
)

print("Brier Score:", bs)
Log Loss: 0.3935792494201294
Brier Score: 0.12619771892591225
cal = eval_df[
    ["pred", "match_winner"]
].copy()

cal["bin"] = pd.qcut(
    cal["pred"],
    q=10,
    duplicates="drop"
)

calibration = (
    cal
    .groupby("bin", observed=True)
    .agg(
        predicted=("pred","mean"),
        observed=("match_winner","mean"),
        n=("match_winner","size")
    )
    .reset_index()
)

print(calibration)
                    bin  predicted  observed      n
0  (-0.0009671, 0.0187]   0.008667  0.009095  14733
1      (0.0187, 0.0541]   0.034486    0.0412  14733
2       (0.0541, 0.111]   0.081313  0.087293  14732
3         (0.111, 0.19]   0.148010  0.153058  14733
4         (0.19, 0.298]   0.241249  0.239343  14732
5        (0.298, 0.438]   0.364524  0.382882  14733
6        (0.438, 0.602]   0.517240  0.517241  14732
7        (0.602, 0.775]   0.688706  0.696735  14733
8        (0.775, 0.911]   0.845219  0.876663  14732
9          (0.911, 1.0]   0.960752  0.974004  14733
import matplotlib.pyplot as plt

plt.figure(figsize=(6,6))

plt.plot(
    calibration["predicted"],
    calibration["observed"],
    marker="o"
)

plt.plot(
    [0,1],
    [0,1],
    "--"
)

plt.xlabel("Predicted Probability")
plt.ylabel("Observed Win Rate")
plt.title("2024 Calibration")

plt.show()

from sklearn.metrics import roc_auc_score

auc = roc_auc_score(
    eval_df["match_winner"].astype(int),
    eval_df["pred"]
)

print("AUC:", auc)
AUC: 0.8961475556717216