-
Notifications
You must be signed in to change notification settings - Fork 2
/
bookies_correlation.py
169 lines (146 loc) · 5.37 KB
/
bookies_correlation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# Python script to correlate bookmaker's odds with simple results of PageRank algorithm
# As described on https://davidabelman.wordpress.com/2015/03/04/146/
import pandas as pd
import SR_Graph
import matplotlib.pyplot as plt
import math
reload(SR_Graph)
def open_and_combine_csvs(csv_list):
    """
    Read every CSV in csv_list and stack them into one pandas dataframe.

    csv_list: iterable of filenames (anything pd.read_csv accepts).
    Returns a single dataframe with the rows of each file in the order
    given and a fresh 0..n-1 index. An empty csv_list yields an empty
    dataframe instead of failing on an unbound local variable.
    """
    frames = [pd.read_csv(filename) for filename in csv_list]
    if not frames:
        # pd.concat refuses an empty list, so handle "no files" explicitly
        return pd.DataFrame()
    # One concat over all frames avoids re-copying the accumulated data
    # once per file, which the pairwise approach did.
    return pd.concat(frames, ignore_index=True)
def create_scorerank_graph(matches_to_use, option = None):
    """
    Create a scorerank graph from a dataframe of played matches.

    matches_to_use: dataframe with 'HomeTeam', 'AwayTeam', 'FTHG' and
        'FTAG' columns (one row per match).
    option: currently unused; kept so existing callers keep working.
    Returns the SR_Graph.Graph after redistribute_scoreranks() and five
    calls' worth of iterate_scoreranks_n.
    Raises ValueError if a goal count cannot be converted to int (e.g. NaN).
    """
    G = SR_Graph.Graph(leak=0.2)

    # Walk the four columns in lockstep rather than looking each row up by
    # label: this avoids the removed `.ix` indexer and is unaffected by
    # duplicate or non-integer index labels.
    fixtures = zip(
        matches_to_use['HomeTeam'],
        matches_to_use['AwayTeam'],
        matches_to_use['FTHG'],
        matches_to_use['FTAG']
    )
    for home_team, away_team, home_score, away_score in fixtures:
        # Each team gets an edge pointing at its opponent, weighted by the
        # number of goals that opponent scored.
        G.add_edge(team_from = home_team, team_to = away_team, weight = int(away_score))
        G.add_edge(team_from = away_team, team_to = home_team, weight = int(home_score))

    # Per the original author's note: set each scorerank to 1 to start
    # with, then iterate.
    G.redistribute_scoreranks()
    G.iterate_scoreranks_n(5)
    return G
def bookie_calculator(home_odds, draw_odds, away_odds, fn=None):
    """
    Given a bookie's decimal odds for home/draw/away, outputs a number
    from -1 to 1.
    -1 indicates 100% certain away win, +1 is 100% certain home win.

    The three odds are inverted into implied probabilities; the result is
    (home - away) divided by the sum of all three, so the bookie's margin
    is normalised away.

    fn: optional callable applied to the result before it is returned
        (e.g. a sigmoid or squaring function to skew the scale). None, or
        any non-callable value, leaves the result untouched.
    Raises ZeroDivisionError if any of the odds is 0.
    """
    # Implied probabilities (these sum to more than 1 when the bookie
    # takes a margin, hence the normalisation below)
    h_pc = 1.0/home_odds
    d_pc = 1.0/draw_odds
    a_pc = 1.0/away_odds
    total_pc = h_pc + d_pc + a_pc

    result = (h_pc - a_pc) / total_pc

    # Previously `fn` was accepted but ignored; apply it when it is usable.
    if callable(fn):
        return fn(result)
    return result
def to_datetime(d):
    """
    Parse a 'dd/mm/yy' date string and return the matching datetime
    """
    from datetime import datetime as _dt
    date_format = '%d/%m/%y'
    parsed = _dt.strptime(d, date_format)
    return parsed
def enough_data(match_df, hometeam, awayteam, threshold=None):
    """
    Returns True if both teams appear in historical matches over threshold

    match_df: dataframe of historical matches with 'HomeTeam' and
        'AwayTeam' columns.
    hometeam, awayteam: the two team names to look for.
    threshold: minimum number of rows (home or away) each team must
        appear in. When None, the module-level
        matches_required_for_prediction is used, which is only defined
        when this file runs as a script (NameError otherwise).
    """
    if threshold is None:
        threshold = matches_required_for_prediction

    def appearances(team):
        # Rows where the team played, whether at home or away
        at_home = (match_df['HomeTeam'] == team).sum()
        away = (match_df['AwayTeam'] == team).sum()
        return int(at_home) + int(away)

    return appearances(hometeam) >= threshold and appearances(awayteam) >= threshold
# Main script below:
# Will loop through all matches in loaded data
# If there is enough historical data it will build ScoreRank model on historic games
# Uses ScoreRank model to predict game outcome
# Looks at Bookie prediction (converts 3 odds into single metric)
# Compares ScoreRank vs Bookie in correlation chart once all matches analysed
# Outputs R^2 value
if __name__ == '__main__':
    # Script entry point. Everything assigned here is a module-level
    # global; enough_data() reads matches_required_for_prediction from
    # module scope.
    # print is written as print(<single expression>) throughout so the
    # same line behaves identically as a Python 2 statement and a
    # Python 3 function call.

    # Settings
    historical_match_number = 380        # preceding matches fed to each model
    matches_required_for_prediction = 3  # min appearances per team in that window
    debug = True
    list_to_save = []

    # Load CSV files into dataframe
    years = range(2013, 2015)
    csv_list = ['{}.csv'.format(year) for year in years]
    df = open_and_combine_csvs(csv_list)
    if debug:
        print("Combined CVs and put into 'df' dataframe")

    # Convert dates to datetime and sort.
    # reset_index (not reindex) is required here: reindex(range(len(df)))
    # realigns rows by their old labels and so silently restores the
    # pre-sort order. mergesort is stable, so matches played on the same
    # date keep their file order.
    df['Date'] = df['Date'].apply(to_datetime)
    df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)
    if debug:
        print("Converted dates to datetimes")
        print(df['Date'].head())

    # For each line in df (index labels now equal row positions)
    previous_date = None
    for i in df.index:

        # Skip if not enough historical matches
        if i < historical_match_number:
            if debug:
                print("i = {}, < {} so skipping...".format(i, historical_match_number))
            continue
        if debug:
            print("i = {}, > {} so continuing...".format(i, historical_match_number))

        match_row = df.loc[i]

        # If date not previously considered, create a new model:
        match_date = match_row['Date']
        if match_date != previous_date:
            if debug:
                print("Match date ({}) not equal to previous, creating new model".format(match_date))
            # Find previous X matches prior to this date, create graph.
            # Positional slicing excludes row i itself; the old label
            # slice was end-inclusive, so the match being predicted
            # leaked into its own model.
            i_start = i - historical_match_number
            matches_to_use = df.iloc[i_start:i]
            G = create_scorerank_graph(matches_to_use)
            previous_date = match_date

        # Make prediction using graph, if enough data
        home_team, away_team = match_row[['HomeTeam', 'AwayTeam']]
        if not enough_data(matches_to_use, home_team, away_team):
            continue
        scorerank_difference = G.get_scorerank(home_team) - G.get_scorerank(away_team)

        # Make bookie prediction, use B365 only for now
        home_odds, draw_odds, away_odds = match_row[['B365H', 'B365D', 'B365A']]
        bookie_prediction = bookie_calculator(home_odds, draw_odds, away_odds)

        # Save result
        row_to_save = {
            'HomeTeam': home_team,
            'AwayTeam': away_team,
            'SR': scorerank_difference,
            'Bookie': bookie_prediction,
        }
        list_to_save.append(row_to_save)

        if debug:
            print("{} vs {}".format(home_team, away_team))
            print("Scorerank: {} - Bookies: {}".format(scorerank_difference, bookie_prediction))
            print("")

    # Convert all of our calculations (rows = matches) to a dataframe
    final_df = pd.DataFrame(list_to_save)

    # Calculate success metric; the value printed is the squared
    # correlation coefficient (R^2)
    correlation = final_df.SR.corr(final_df.Bookie)
    print("Correlation between Bet365 and ScoreRank: {}".format(correlation**2))

    # Plot the correlation
    plt.scatter(final_df.SR, final_df.Bookie)
    plt.title('Correlation between ScoreRank model and Bet365 odds')
    plt.xlabel('Scorerank prediction (un-normalised)')
    plt.ylabel('Bet365 prediction')
    plt.show()