-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
115 lines (89 loc) · 4.76 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import numpy as np
import numpy.random
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import util
class Data:
    """Load league match data, fit a random-forest classifier and sample stats.

    On construction the five league CSV files are read and concatenated,
    opponent names are cleaned, categorical predictors are encoded, rolling
    averages are computed per team through ``util.rolling_averages`` and the
    classifier is fitted on the result.

    Attributes set by ``__init__``:
        rf: the fitted ``RandomForestClassifier``.
        team_codes: mapping of team/opponent name to its integer code.
        train: the frame the model was fitted on (one row per match).
    """

    # Per-match statistics that get a "<name>_rolling" predictor column and
    # that averages() / st_devs() / create_stats() report on, in this order.
    STAT_COLS = [
        "gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt",
        "sh_against", "sot_against", "dist_against",
        "fk_against", "pk_against", "pkatt_against",
    ]

    # Source files, concatenated in this order.
    MATCH_FILES = [
        "data/serie_a_matches.csv",
        "data/premier_league_matches.csv",
        "data/la_liga_matches.csv",
        "data/bundesliga_matches.csv",
        "data/ligue_1_matches.csv",
    ]

    def __init__(self):
        """Read the match files, build the predictors and fit the model."""
        # Initialize our model
        self.rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

        # Prepare our data
        matches = pd.concat([pd.read_csv(path, index_col=0) for path in self.MATCH_FILES])

        # Clean up the data: drop whatever precedes the first uppercase letter
        # of each opponent name (presumably a lowercase country prefix -
        # TODO confirm against the CSVs). The mapping is built once from the
        # unique names and applied in a single pass instead of one full-frame
        # replace per row.
        renames = {}
        for name in matches["opponent"].dropna().unique():
            cleaned = self._strip_lowercase_prefix(name)
            if cleaned != name:
                renames[name] = cleaned
        if renames:
            matches = matches.replace(renames)
        util.fix_team_names(matches)

        # predictors
        matches['venue_code'] = matches["venue"].astype("category").cat.codes
        matches['team_code'] = matches["team"].astype("category").cat.codes
        matches['date_code'] = matches.index
        self.team_codes = dict(zip(matches['team'], matches['team_code']))
        # Opponents that never appear in the "team" column get fresh codes.
        for opp in matches["opponent"].unique():
            if self.team_codes.get(opp) is None:
                self.team_codes[opp] = len(self.team_codes) + 1
        matches["opp_code"] = matches["opponent"].map(self.team_codes)
        # Scores are parsed from their leading token so that multi-digit
        # values ("10") and annotated values ("2 (4)") both come out right.
        matches["gf"] = [self._leading_number(gf) for gf in matches["gf"]]
        matches["ga"] = [self._leading_number(ga) for ga in matches["ga"]]
        matches["target"] = [util.define_result(r) for r in matches["result"]]

        # Rolling averages
        cols = self.STAT_COLS
        new_cols = [f"{c}_rolling" for c in cols]
        matches_rolling = matches.groupby("team").apply(lambda x: util.rolling_averages(x, cols, new_cols))
        matches_rolling = matches_rolling.droplevel('team')
        matches_rolling.index = range(matches_rolling.shape[0])

        # fit our model
        self.train = matches_rolling
        predictors = ["venue_code", "opp_code", "date_code"] + new_cols
        self.rf.fit(self.train[predictors], self.train["target"])

    @staticmethod
    def _strip_lowercase_prefix(name):
        """Return ``name`` from its first uppercase character onwards.

        A name without any uppercase character is returned unchanged.
        """
        start = next((i for i, ch in enumerate(name) if ch.isupper()), 0)
        return name[start:]

    @staticmethod
    def _leading_number(value):
        """Return the first whitespace-separated token of ``value`` as a float."""
        return float(str(value).split()[0])

    @staticmethod
    def _nan_free_array(values):
        """Return ``values`` as a float array with every NaN replaced by 0."""
        arr = np.array(values, dtype=float)
        arr[np.isnan(arr)] = 0.0
        return arr

    def _team_rows(self, team, opponent=None):
        """Return the training rows for ``team``, or None if the team is unknown.

        When ``opponent`` is given, only matches against that opponent are
        kept; the result may then be an empty frame.
        """
        rows = self.train[self.train["team"] == team]
        if rows.empty:
            return None
        if opponent is not None:
            rows = rows[rows["opponent"] == opponent]
        return rows

    def averages(self, team: str, opponent=None):
        """Return the mean of every stat column for ``team``.

        Args:
            team: team name as stored in the training frame.
            opponent: if given, restrict to matches against this opponent.

        Returns:
            A list of zeros (one per stat column) when the team has no data,
            otherwise a pandas Series indexed by stat column. The Series holds
            NaN when the opponent filter leaves no rows.
        """
        group = self._team_rows(team, opponent)
        if group is None:
            return [0] * len(self.STAT_COLS)
        return group[self.STAT_COLS].fillna(0).mean()

    def st_devs(self, team: str, opponent=None):
        """Return the population standard deviation of every stat column.

        Args:
            team: team name as stored in the training frame.
            opponent: if given, restrict to matches against this opponent.

        Returns:
            A list of zeros (one per stat column) when the team has no data,
            otherwise a pandas Series indexed by stat column (``ddof=0``).
        """
        group = self._team_rows(team, opponent)
        if group is None:
            return [0] * len(self.STAT_COLS)
        return group[self.STAT_COLS].fillna(0).std(ddof=0, numeric_only=True)

    def create_stats(self, team: str, opponent: str, overall_weight: float = 1.00, opponent_weight: float = 0.10):
        """Sample one plausible stat line for ``team`` playing ``opponent``.

        Each stat is drawn once from a normal distribution whose mean is the
        weighted sum of the team's overall average and its average against
        ``opponent``, and whose standard deviation is the sum of the two
        corresponding standard deviations.

        Args:
            team: team to generate stats for.
            opponent: opponent used for the head-to-head component.
            overall_weight: weight applied to the overall averages.
            opponent_weight: weight applied to the head-to-head averages.

        Returns:
            A list of ints, one per stat column, truncated toward zero.
        """
        rng = np.random.default_rng()

        # Missing values (unknown team, no head-to-head matches) count as 0.
        overall_avgs = self._nan_free_array(self.averages(team))
        overall_stds = self._nan_free_array(self.st_devs(team))
        opponent_avgs = self._nan_free_array(self.averages(team, opponent))
        # The head-to-head spread must come from st_devs, not averages.
        opponent_stds = self._nan_free_array(self.st_devs(team, opponent))

        means = overall_avgs * overall_weight + opponent_avgs * opponent_weight
        spreads = overall_stds + opponent_stds

        # One draw per stat column.
        return [int(draw) for draw in rng.normal(means, spreads)]