WC2022_predictor / ml /data_prepare.py
phong.dao
init app
49a060f
raw
history blame
10.4 kB
"""
The data processing is based on https://www.kaggle.com/code/sslp23/predicting-fifa-2022-world-cup-with-ml
"""
import os.path
import pandas as pd
from sklearn.model_selection import train_test_split
from configs.config import cfg
from configs.constants import DATA_ROOT
def result_finder(home, away):
    """
    Encode the outcome of a single match from its two scores.

    The returned series holds three values, in order: the result code
    (0 when ``home`` is greater, 1 when ``away`` is greater, 2 otherwise),
    the points credited to the home side and the points credited to the
    away side (3 for the higher score, 0 for the lower, 1 each otherwise).

    :param home: score of the home team
    :param away: score of the away team
    :return: ``pd.Series`` of ``[result, home_points, away_points]``
    """
    if home > away:
        encoded = [0, 3, 0]
    elif home < away:
        encoded = [1, 0, 3]
    else:
        # Neither side is strictly ahead: treated as a draw.
        encoded = [2, 1, 1]
    return pd.Series(encoded)
def create_dataset(df: pd.DataFrame):
    """
    Split a model table into train and test partitions.

    Every column from position 3 onwards is used as a feature; the label is
    the ``target`` column, kept as a one-column frame. The split holds out
    22% of the rows for testing and uses a fixed random seed (100).

    :param df: table whose feature columns start at position 3 and which
        contains a ``target`` column
    :return: tuple ``(x_train, x_test, y_train, y_test)``
    """
    features = df.iloc[:, 3:]
    labels = df[["target"]]
    parts = train_test_split(features, labels, test_size=0.22, random_state=100)
    # train_test_split yields a list; callers receive a 4-tuple.
    return tuple(parts)
def data_preparing():
    """
    Build the match-level feature table and the model-ready dataset.

    Steps:

    1. Load match results from ``cfg.data.result_url``; if that read fails,
       fall back to ``cfg.data.result_file`` under ``DATA_ROOT``.
    2. Load the ranking file, resample it to one row per country per day and
       forward-fill, then join it onto every match for both teams.
    3. For every team appearance, compute statistics over that team's games
       strictly before the match date (all past games and the last five).
    4. Attach those statistics to the matches, one-hot encode the friendly
       flag, drop rows containing NaN and add the ``target`` column.

    :return: tuple ``(df, model_db)`` where ``df`` is the feature table with
        the ``target`` column and ``model_db`` is ``create_db(df)``
    """
    try:
        df = pd.read_csv(cfg.data.result_url)
    except Exception as e:
        # Best effort: report why the primary source failed, then read the
        # local file instead.
        print(e)
        df = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.result_file))
    df["date"] = pd.to_datetime(df["date"])
    df.dropna(inplace=True)
    df = df[(df["date"] >= cfg.day_get_result)].reset_index(drop=True)

    # RANK data prepare
    rank = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.rank_file))
    rank["rank_date"] = pd.to_datetime(rank["rank_date"])
    rank = rank[(rank["rank_date"] >= cfg.day_get_rank)].reset_index(drop=True)
    # Rename countries in the ranking data so they can be joined on the team
    # names of the results data.
    rank["country_full"] = (
        rank["country_full"]
        .str.replace("IR Iran", "Iran")
        .str.replace("Korea Republic", "South Korea")
        .str.replace("USA", "United States")
    )

    # The merge is made in order to get a dataset of FIFA games and their rankings.
    # Rankings are resampled to daily rows per country and forward-filled so
    # that a ranking row exists for each match date. ``.ffill()`` replaces
    # the deprecated ``fillna(method='ffill')``.
    rank = (
        rank.set_index(["rank_date"])
        .groupby(["country_full"], group_keys=False)
        .resample("D")
        .first()
        .ffill()
        .reset_index()
    )
    rank_cols = ["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]
    # Default (inner) merges: a match is kept only when a ranking row exists
    # for the team on the match date.
    df_wc_ranked = df.merge(
        rank[rank_cols],
        left_on=["date", "home_team"],
        right_on=["rank_date", "country_full"],
    ).drop(["rank_date", "country_full"], axis=1)
    df_wc_ranked = df_wc_ranked.merge(
        rank[rank_cols],
        left_on=["date", "away_team"],
        right_on=["rank_date", "country_full"],
        suffixes=("_home", "_away"),
    ).drop(["rank_date", "country_full"], axis=1)

    # Featuring
    df = df_wc_ranked
    df[["result", "home_team_points", "away_team_points"]] = df.apply(
        lambda x: result_finder(x["home_score"], x["away_score"]), axis=1)

    # We create columns that will help in the creation of the features: ranking difference,
    # points won at the game vs. the rank of the team faced, and goal difference in the game.
    # All features that are not differences should be created for the two teams (away and home).
    df["rank_dif"] = df["rank_home"] - df["rank_away"]
    df["sg"] = df["home_score"] - df["away_score"]
    df["points_home_by_rank"] = df["home_team_points"] / df["rank_away"]
    df["points_away_by_rank"] = df["away_team_points"] / df["rank_home"]

    # In order to create the features, the dataset is separated into a home-team view and an
    # away-team view, which are stacked so past-game values can be computed per team.
    # Afterwards the stacked table is split again and merged back onto the original dataset.
    home_team = df[["date", "home_team", "home_score", "away_score", "rank_home", "rank_away", "rank_change_home",
                    "total_points_home", "result", "rank_dif", "points_home_by_rank", "home_team_points"]]
    away_team = df[["date", "away_team", "away_score", "home_score", "rank_away", "rank_home", "rank_change_away",
                    "total_points_away", "result", "rank_dif", "points_away_by_rank", "away_team_points"]]
    # Normalise both views to the same column names: the team's own columns
    # lose their home/away marker and the opponent's columns get "suf".
    home_team.columns = [h.replace("home_", "").replace("_home", "").replace("away_", "suf_").replace("_away", "_suf")
                         for h in home_team.columns]
    away_team.columns = [a.replace("away_", "").replace("_away", "").replace("home_", "suf_").replace("_home", "_suf")
                         for a in away_team.columns]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # views in the same order (home rows first, then away rows).
    team_stats = pd.concat([home_team, away_team])

    stats_val = []
    for _, row in team_stats.iterrows():
        team = row["team"]
        date = row["date"]
        # Only games strictly before this match, most recent first.
        past_games = team_stats.loc[
            (team_stats["team"] == team) & (team_stats["date"] < date)
        ].sort_values(by=["date"], ascending=False)
        last5 = past_games.head(5)

        goals = past_games["score"].mean()
        goals_l5 = last5["score"].mean()
        goals_suf = past_games["suf_score"].mean()
        goals_suf_l5 = last5["suf_score"].mean()
        # Named rank_faced so the ranking DataFrame ``rank`` is not shadowed.
        rank_faced = past_games["rank_suf"].mean()
        rank_faced_l5 = last5["rank_suf"].mean()

        if len(last5) > 0:
            # Amount of ranking points earned: newest value minus oldest value.
            points = past_games["total_points"].values[0] - past_games["total_points"].values[-1]
            points_l5 = last5["total_points"].values[0] - last5["total_points"].values[-1]
        else:
            points = 0
            points_l5 = 0

        gp = past_games["team_points"].mean()
        gp_l5 = last5["team_points"].mean()
        gp_rank = past_games["points_by_rank"].mean()
        gp_rank_l5 = last5["points_by_rank"].mean()

        stats_val.append(
            [goals, goals_l5, goals_suf, goals_suf_l5, rank_faced, rank_faced_l5, points, points_l5, gp, gp_l5,
             gp_rank, gp_rank_l5])

    stats_cols = ["goals_mean", "goals_mean_l5", "goals_suf_mean", "goals_suf_mean_l5", "rank_mean", "rank_mean_l5",
                  "points_mean", "points_mean_l5", "game_points_mean", "game_points_mean_l5",
                  "game_points_rank_mean", "game_points_rank_mean_l5"]
    stats_df = pd.DataFrame(stats_val, columns=stats_cols)
    full_df = pd.concat([team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False)

    # The first half of the stacked table is the home view, the second half
    # the away view; keep only the 12 statistic columns of each.
    half = int(full_df.shape[0] / 2)
    home_team_stats = full_df.iloc[:half, :]
    away_team_stats = full_df.iloc[half:, :]
    home_team_stats = home_team_stats[home_team_stats.columns[-12:]]
    away_team_stats = away_team_stats[away_team_stats.columns[-12:]]

    # In order to unify the database, a home/away prefix is added to each column.
    # After that, the data is ready to be merged.
    home_team_stats.columns = ["home_" + str(col) for col in home_team_stats.columns]
    away_team_stats.columns = ["away_" + str(col) for col in away_team_stats.columns]
    match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)
    full_df = pd.concat([df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False)

    # Flag friendly games and one-hot encode the flag (no rows are dropped here).
    full_df["is_friendly"] = full_df["tournament"].apply(find_friendly)
    full_df = pd.get_dummies(full_df, columns=["is_friendly"])

    base_df = full_df[
        ["date", "home_team", "away_team", "rank_home", "rank_away", "home_score", "away_score", "result",
         "rank_dif", "rank_change_home", "rank_change_away", 'home_goals_mean',
         'home_goals_mean_l5', 'home_goals_suf_mean', 'home_goals_suf_mean_l5',
         'home_rank_mean', 'home_rank_mean_l5', 'home_points_mean',
         'home_points_mean_l5', 'away_goals_mean', 'away_goals_mean_l5',
         'away_goals_suf_mean', 'away_goals_suf_mean_l5', 'away_rank_mean',
         'away_rank_mean_l5', 'away_points_mean', 'away_points_mean_l5', 'home_game_points_mean',
         'home_game_points_mean_l5',
         'home_game_points_rank_mean', 'home_game_points_rank_mean_l5', 'away_game_points_mean',
         'away_game_points_mean_l5', 'away_game_points_rank_mean',
         'away_game_points_rank_mean_l5',
         'is_friendly_0', 'is_friendly_1']]

    # Rows with NaN (e.g. a team's first game, which has no history) are
    # removed. The explicit copy makes the column assignment below operate on
    # an independent frame rather than on a slice of ``full_df``.
    df = base_df.dropna().copy()
    df["target"] = df["result"].apply(no_draw)
    model_db = create_db(df)
    return df, model_db
def find_friendly(x):
    """
    Flag whether a tournament name denotes a friendly match.

    :param x: tournament name
    :return: 1 when ``x`` equals ``"Friendly"`` exactly, otherwise 0
    """
    return 1 if x == "Friendly" else 0
def create_db(df):
    """
    Derive the home-minus-away difference features used by the model.

    :param df: feature table containing the team names, ``target``,
        ``rank_dif``, the per-team mean statistics and the two
        ``is_friendly_*`` columns
    :return: new frame with the team names, ``target``, ``rank_dif``, the
        derived difference columns and the ``is_friendly_*`` columns
    """
    source_cols = ["home_team", "away_team", "target", "rank_dif", "home_goals_mean",
                   "home_rank_mean", "away_goals_mean", "away_rank_mean", "home_rank_mean_l5", "away_rank_mean_l5",
                   "home_goals_suf_mean", "away_goals_suf_mean", "home_goals_mean_l5", "away_goals_mean_l5",
                   "home_goals_suf_mean_l5", "away_goals_suf_mean_l5", "home_game_points_rank_mean",
                   "home_game_points_rank_mean_l5", "away_game_points_rank_mean", "away_game_points_rank_mean_l5",
                   "is_friendly_0", "is_friendly_1"]
    base = df.loc[:, source_cols]

    # feature name -> (home column, away column); each feature is home - away.
    plain_differences = {
        "goals_dif": ("home_goals_mean", "away_goals_mean"),
        "goals_dif_l5": ("home_goals_mean_l5", "away_goals_mean_l5"),
        "goals_suf_dif": ("home_goals_suf_mean", "away_goals_suf_mean"),
        "goals_suf_dif_l5": ("home_goals_suf_mean_l5", "away_goals_suf_mean_l5"),
        "dif_rank_agst": ("home_rank_mean", "away_rank_mean"),
        "dif_rank_agst_l5": ("home_rank_mean_l5", "away_rank_mean_l5"),
        "dif_points_rank": ("home_game_points_rank_mean", "away_game_points_rank_mean"),
        "dif_points_rank_l5": ("home_game_points_rank_mean_l5", "away_game_points_rank_mean_l5"),
    }
    derived = {
        feature: base[home_col] - base[away_col]
        for feature, (home_col, away_col) in plain_differences.items()
    }

    # Goals scaled by the mean rank of the opponents faced, home minus away.
    home_goals_per_rank = base["home_goals_mean"] / base["home_rank_mean"]
    away_goals_per_rank = base["away_goals_mean"] / base["away_rank_mean"]
    derived["goals_per_ranking_dif"] = home_goals_per_rank - away_goals_per_rank

    output_cols = ["home_team", "away_team", "target", "rank_dif", "goals_dif", "goals_dif_l5",
                   "goals_suf_dif", "goals_suf_dif_l5", "goals_per_ranking_dif", "dif_rank_agst",
                   "dif_rank_agst_l5", "dif_points_rank", "dif_points_rank_l5", "is_friendly_0",
                   "is_friendly_1"]
    model_df = base.assign(**derived)[output_cols]
    return model_df
def no_draw(x):
    """
    Collapse the result code 2 into 1, leaving every other value unchanged.

    :param x: result code
    :return: 1 when ``x`` equals 2, otherwise ``x`` itself
    """
    return 1 if x == 2 else x