Back to Article
ELO rating system for 1.Bundesliga saison 2015/2016
Download Notebook

ELO rating system for 1.Bundesliga saison 2015/2016

  1. Statsbomb open-data minimal exploratory analysis

  2. ELO rating system (with arbitrary K and s factors)

  3. Application the ELO rating system to 1. Bundesliga 2015/2016 season matches

  4. Finding optimal factors s and K based on the season’s data

  5. Display the final ranking table with optimal K and s factors

  6. Finding the most surprising win in the season

In [1]:
# imports

import numpy as np
import pandas as pd

#from mplsoccer import Sbopen # unified for various data providers and provides basic data-related visualisation

from statsbombpy import sb # provides aggregated statistics

from sklearn.metrics import mean_squared_error
from itertools import product

1. Statsbomb open-data minimal exploratory analysis

In [2]:
competitions = sb.competitions()
competitions.info()
/home/nz/workspace/open-data/venv/lib/python3.10/site-packages/statsbombpy/api_client.py:21: NoAuthWarning: credentials were not supplied. open data access only
  warnings.warn(
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   competition_id             74 non-null     int64 
 1   season_id                  74 non-null     int64 
 2   country_name               74 non-null     object
 3   competition_name           74 non-null     object
 4   competition_gender         74 non-null     object
 5   competition_youth          74 non-null     bool  
 6   competition_international  74 non-null     bool  
 7   season_name                74 non-null     object
 8   match_updated              74 non-null     object
 9   match_updated_360          56 non-null     object
 10  match_available_360        10 non-null     object
 11  match_available            74 non-null     object
dtypes: bool(2), int64(2), object(8)
memory usage: 6.1+ KB
In [3]:
competitions.loc[
    (competitions['competition_name'] == "1. Bundesliga") &
    (competitions['season_name'] == "2015/2016")] 
competition_id season_id country_name competition_name competition_gender competition_youth competition_international season_name match_updated match_updated_360 match_available_360 match_available
1 9 27 Germany 1. Bundesliga male False False 2015/2016 2024-05-19T11:11:14.192381 None None 2024-05-19T11:11:14.192381
In [4]:
matches = sb.matches(competition_id=9, season_id=27)
matches.info()
/home/nz/workspace/open-data/venv/lib/python3.10/site-packages/statsbombpy/api_client.py:21: NoAuthWarning: credentials were not supplied. open data access only
  warnings.warn(
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   match_id               306 non-null    int64 
 1   match_date             306 non-null    object
 2   kick_off               306 non-null    object
 3   competition            306 non-null    object
 4   season                 306 non-null    object
 5   home_team              306 non-null    object
 6   away_team              306 non-null    object
 7   home_score             306 non-null    int64 
 8   away_score             306 non-null    int64 
 9   match_status           306 non-null    object
 10  match_status_360       306 non-null    object
 11  last_updated           306 non-null    object
 12  last_updated_360       0 non-null      object
 13  match_week             306 non-null    int64 
 14  competition_stage      306 non-null    object
 15  stadium                306 non-null    object
 16  referee                306 non-null    object
 17  home_managers          306 non-null    object
 18  away_managers          306 non-null    object
 19  data_version           306 non-null    object
 20  shot_fidelity_version  306 non-null    object
 21  xy_fidelity_version    306 non-null    object
dtypes: int64(4), object(18)
memory usage: 52.7+ KB
In [5]:
matches.head(5)
match_id match_date kick_off competition season home_team away_team home_score away_score match_status ... last_updated_360 match_week competition_stage stadium referee home_managers away_managers data_version shot_fidelity_version xy_fidelity_version
0 3890561 2016-05-14 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Hoffenheim Schalke 04 1 4 available ... None 34 Regular Season PreZero Arena Felix Brych Julian Nagelsmann André Breitenreiter 1.1.0 2 2
1 3890505 2016-04-02 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Bayern Munich Eintracht Frankfurt 1 0 available ... None 28 Regular Season Allianz Arena Florian Meyer Josep Guardiola i Sala Niko KovaÄŤ 1.1.0 2 2
2 3890511 2016-04-08 20:30:00.000 Germany - 1. Bundesliga 2015/2016 Hertha Berlin Hannover 96 2 2 available ... None 29 Regular Season Olympiastadion Berlin Benjamin Brand Pál Dárdai Daniel Stendel 1.1.0 2 2
3 3890515 2016-04-09 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Hamburger SV Darmstadt 98 1 2 available ... None 29 Regular Season Volksparkstadion Peter Sippel Bruno Labbadia Dirk Schuster 1.1.0 2 2
4 3890411 2015-12-20 16:30:00.000 Germany - 1. Bundesliga 2015/2016 Hertha Berlin FSV Mainz 05 2 0 available ... None 17 Regular Season Olympiastadion Berlin Peter Sippel Pál Dárdai Martin Schmidt 1.1.0 2 2

5 rows Ă— 22 columns

In [6]:
# sort chronologically
matches = matches.sort_values(by='match_date')
matches
match_id match_date kick_off competition season home_team away_team home_score away_score match_status ... last_updated_360 match_week competition_stage stadium referee home_managers away_managers data_version shot_fidelity_version xy_fidelity_version
305 3890259 2015-08-14 20:30:00.000 Germany - 1. Bundesliga 2015/2016 Bayern Munich Hamburger SV 5 0 available ... None 1 Regular Season Allianz Arena Bastian Dankert Josep Guardiola i Sala Bruno Labbadia 1.1.0 2 2
299 3890265 2015-08-15 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Augsburg Hertha Berlin 0 1 available ... None 1 Regular Season WWK Arena Tobias Welz Markus Weinzierl Pál Dárdai 1.1.0 2 2
304 3890260 2015-08-15 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Bayer Leverkusen Hoffenheim 2 1 available ... None 1 Regular Season BayArena Robert Hartmann Roger Schmidt Markus Gisdol 1.1.0 2 2
300 3890264 2015-08-15 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Werder Bremen Schalke 04 0 3 available ... None 1 Regular Season Wohninvest Weserstadion Daniel Siebert Viktor Skripnik André Breitenreiter 1.1.0 2 2
302 3890262 2015-08-15 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Darmstadt 98 Hannover 96 2 2 available ... None 1 Regular Season Merck-Stadion am Böllenfalltor Felix Brych Dirk Schuster Michael Frontzeck 1.1.0 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18 3890560 2016-05-14 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Darmstadt 98 Borussia Mönchengladbach 0 2 available ... None 34 Regular Season Merck-Stadion am Böllenfalltor Peter Sippel Dirk Schuster André Schubert 1.1.0 2 2
17 3890562 2016-05-14 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Borussia Dortmund FC Köln 2 2 available ... None 34 Regular Season Signal-Iduna-Park Michael Weiner Thomas Tuchel Peter Stöger 1.1.0 2 2
16 3890563 2016-05-14 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Bayer Leverkusen Ingolstadt 3 2 available ... None 34 Regular Season BayArena Guido Winkmann Roger Schmidt Ralph HasenhĂĽttl 1.1.0 2 2
15 3890564 2016-05-14 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Augsburg Hamburger SV 1 3 available ... None 34 Regular Season WWK Arena Florian Meyer Markus Weinzierl Bruno Labbadia 1.1.0 2 2
0 3890561 2016-05-14 15:30:00.000 Germany - 1. Bundesliga 2015/2016 Hoffenheim Schalke 04 1 4 available ... None 34 Regular Season PreZero Arena Felix Brych Julian Nagelsmann André Breitenreiter 1.1.0 2 2

306 rows Ă— 22 columns

2.ELO rating system (with arbitrary K and s factors)

https://en.wikipedia.org/wiki/Elo_rating_system

$ R_{} = R_{} + K ( - ) $

//also possible to incorporate the Goal difference factor like \(G = \frac{GD}{1+GD}\)

\(\text{Actual score}\) for the team is: * 1 - for a win * 0.5 - for a draw * 0 - for a loss

\(\text{Expected score}\) for the team: $ E = $

In [7]:
# initial parameters

K = 15 # scaling factor (how much a team's rating changes after each match)
s = 15 # factor (how difference in rating translates into expected win probability)

R_0 = 100 # initial rating
In [8]:
team_ratings = {}

def get_rating(team):
    """
    initialize rating with R_0 for every team
    """
    if team not in team_ratings:
        team_ratings[team] = R_0
    return team_ratings[team]

def expected_score(team_rating, opponent_rating, s):
    return 1 / (1+10**((opponent_rating - team_rating)/s))

def update_ratings(team1, team2, result, K, s):
    """
    result : actual score for team1 (1=win, 0.5=draw, 0 = loss)
    """
    r1, r2 = get_rating(team1), get_rating(team2)
    e1, e2 = expected_score (r1,r2,s), 1 - expected_score(r1,r2,s)

    team_ratings[team1] = r1 + K * (result - e1)
    team_ratings[team2] = r2 + K * ((1-result) - e2)

def get_team_rankings():
    return sorted(team_ratings.items(), key = lambda x: x[1], reverse=True)

3. Application the ELO rating system to 1. Bundesliga 2015/2016 season matches

In [9]:
def calculate_team_ratings (matches, K, s):

    team_ratings = {}

    # iterate over each match
    for idx, match in matches.iterrows():
        #print (match)
        home_team = match['home_team']
        away_team = match['away_team']
        home_score = match['home_score']
        away_score = match['away_score']
        
        #results of the match
        #first without goal's difference consideration
        if home_score > away_score:
            result = 1 # home win of the match
        elif home_score == away_score:
            result = 0.5 # draw
        else:
            result = 0 # loss
        update_ratings(home_team, away_team, result, K, s)

    final_rankings = get_team_rankings()

    return final_rankings

df_final_rankings = pd.DataFrame(calculate_team_ratings(matches, K, s), columns=['Team', 'Rating'])
print (df_final_rankings)
                        Team      Rating
0   Borussia Mönchengladbach  119.256246
1              Werder Bremen  117.325608
2              Bayern Munich  115.286875
3                 Schalke 04  110.249651
4        Eintracht Frankfurt  105.792199
5          Borussia Dortmund  105.689099
6                    FC Köln  105.357665
7           Bayer Leverkusen  104.791526
8                Hannover 96  100.557829
9                  Wolfsburg   98.128545
10              Darmstadt 98   97.147549
11              Hamburger SV   96.802878
12                  Augsburg   94.286453
13                Hoffenheim   89.494287
14              FSV Mainz 05   88.182061
15             Hertha Berlin   87.955994
16                Ingolstadt   86.235838
17             VfB Stuttgart   77.459697

4. Finding optimal factors s and K based on the season’s data

In [10]:
# based on the saison result's historical chronological data
#  
# data is discrete (1 - win, 0.5 - draw, 0 - loss)
# assuming the data is imbalanced

# other factors(which could influence K and s values) could be considered:
# - home field advantage
# - goal difference
# - match importance (K-value)

# Loss function - MSE,  as intepretable metric - difference between predictions and actual results
In [11]:
# calculate mse between predicted (ELO rating system) and actual values
def compute_mse(matches, K, s):

    global team_ratings
    team_ratings = {}
    
    predicted_results = []
    actual_results = []

    # iterate over the matches in chronological way
    for idx, match in matches.iterrows():
        home_team = match['home_team']
        away_team = match['away_team']
        home_score = match['home_score']
        away_score = match['away_score']

        # without goal's difference consideration
        if home_score > away_score:
            result = 1 # home win of the match
        elif home_score == away_score:
            result = 0.5 # draw
        else:
            result = 0 # loss

        # 
        r1 = get_rating(home_team)
        r2 = get_rating(away_team)
        e1 = expected_score (r1,r2, s)

        predicted_results.append(e1)
        actual_results.append(result)
    
        update_ratings (home_team, away_team, result, K, s)

    mse = mean_squared_error(actual_results,predicted_results)

    return mse
In [12]:
def find_optim_param(matches, K_range, s_range):
    """
    finding combination of parameters via Grid Search //alternatives -e.g. random search
    by minimizimg MSE 
    """
    best_K  = None
    best_s = None
    best_mse = float ('inf')

    for K, s in product(K_range, s_range):
        mse = compute_mse(matches, K, s)
        if mse < best_mse:
            # save current best values
            best_mse = mse
            best_K = K
            best_s = s
        print(f"K={K}, s={s}, MSE= {mse}")
    print(f"best K: {best_K}, best s: {best_s}, best MSE: {best_mse}")
    return best_K, best_s

# define range of values to search
K_range = np.arange(25, 50, 5) # https://eloratings.net/about, https://courses.cs.vt.edu/cs5824/Fall15/project_reports/sullivan_cronin.pdf
s_range = np.arange(400, 500, 50)  #


optimal_K, optimal_s = find_optim_param(matches, K_range, s_range)
K=25, s=400, MSE= 0.17810227843637477
K=25, s=450, MSE= 0.17836864179747566
K=30, s=400, MSE= 0.17795245927761352
K=30, s=450, MSE= 0.17801034693620812
K=35, s=400, MSE= 0.1781143332074865
K=35, s=450, MSE= 0.177965323655386
K=40, s=400, MSE= 0.17849574518103187
K=40, s=450, MSE= 0.17814722318708967
K=45, s=400, MSE= 0.17903423014209696
K=45, s=450, MSE= 0.17849574518103187
best K: 30, best s: 400, best MSE: 0.17795245927761352

5. Display the final ranking table with optimal K and s factors

In [13]:
df_final_rankings = pd.DataFrame(calculate_team_ratings(matches, optimal_K, optimal_s), columns=['Team', 'Rating'])
print (df_final_rankings)
                        Team      Rating
0              Bayern Munich  399.843898
1          Borussia Dortmund  311.559629
2           Bayer Leverkusen  206.012058
3   Borussia Mönchengladbach  161.946876
4                 Schalke 04  130.313364
5               FSV Mainz 05  109.815993
6                    FC Köln   90.355869
7              Hertha Berlin   83.103770
8                   Augsburg   63.408002
9                  Wolfsburg   62.476312
10             Werder Bremen   55.539498
11                Hoffenheim   54.275737
12              Hamburger SV   47.719699
13              Darmstadt 98   42.814199
14       Eintracht Frankfurt   37.806151
15                Ingolstadt   35.617444
16             VfB Stuttgart  -27.234742
17               Hannover 96  -65.373758

6. Finding the most surprising win in the season

Initial idea - the win of low-rated team with the gratest difference in ratings between teams in match.

In [14]:

def calculate_surprising_win (matches, K, s):

    team_ratings = {}

    most_surprising_win = None
    largest_rating_gain = 0

    # iterate over each match
    for idx, match in matches.iterrows():
        #print (match)
        home_team = match['home_team']
        away_team = match['away_team']
        home_score = match['home_score']
        away_score = match['away_score']
        
        #results of the match
        #first without goal's difference consideration
        if home_score > away_score:
            result = 1 # home win of the match
        elif home_score == away_score:
            result = 0.5 # draw
        else:
            result = 0 # loss
        
        # pre-match ratings
        home_r = get_rating(home_team)
        away_r = get_rating(away_team)

        update_ratings(home_team, away_team, result, K, s)

        # rating gain for wining team
        if result == 1:
            rating_gain = abs(get_rating(home_team) - home_r)
        else:
            rating_gain = abs(get_rating(away_team) - away_r)

        # the most surprising win
        if rating_gain > largest_rating_gain:
            largest_rating_gain = rating_gain
            most_surprising_win = match

    return most_surprising_win, largest_rating_gain


most_surprising_win,largest_rating_gain = calculate_surprising_win(matches, optimal_K, optimal_s)
print (most_surprising_win)
print ("Largest rating gain")
print (largest_rating_gain)
match_id                                    3890553
match_date                               2016-05-07
kick_off                               15:30:00.000
competition                 Germany - 1. Bundesliga
season                                    2015/2016
home_team                       Eintracht Frankfurt
away_team                         Borussia Dortmund
home_score                                        1
away_score                                        0
match_status                              available
match_status_360                        unscheduled
last_updated             2023-07-19T12:28:27.869912
last_updated_360                               None
match_week                                       33
competition_stage                    Regular Season
stadium                          Deutsche Bank Park
referee                              Daniel Siebert
home_managers                            Niko KovaÄŤ
away_managers                         Thomas Tuchel
data_version                                  1.1.0
shot_fidelity_version                             2
xy_fidelity_version                               2
Name: 25, dtype: object
Largest rating gain
26.126711087581256