Commit: Code reorganize, start to prediction
Showing 55 changed files with 7,165 additions and 2,202 deletions.
One empty file and seven binary files changed (contents not shown).
@@ -0,0 +1,155 @@
import glob
from os import path  # needed by save() below
import pandas as pd
import numpy as np
import pytz
from datetime import timezone, datetime, timedelta
import api.util

def bind_full(df_sofa, df_op):
    """Bind Sofascore matches to OP matches in three passes: both full names, then first word of t1, then first word of t2."""
    df_sofa_ = df_sofa.copy()
    df_op_ = df_op.copy()
    print(f'IN: Sofa={df_sofa_.shape}, OP={df_op_.shape}')
    # First word of each team name, used by the fuzzy single-team passes below.
    df_sofa_['t1_first'] = df_sofa_['t1'].apply(lambda x: x.split(' ')[0])
    df_sofa_['t2_first'] = df_sofa_['t2'].apply(lambda x: x.split(' ')[0])
    df_op_['t1_first'] = df_op_['t1'].apply(lambda x: x.split(' ')[0])
    df_op_['t2_first'] = df_op_['t2'].apply(lambda x: x.split(' ')[0])

    # Both teams step: exact match on date and both full team names.
    df_op_ = df_op_.rename(columns={'tid1': 'op_tid1', 'tid2': 'op_tid2', 't1': 'op_t1', 't2': 'op_t2', 'mid': 'op_mid'})
    df_merged = df_sofa_.merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'ds']],
                               left_on=['ds', 't1', 't2'], right_on=['ds', 'op_t1', 'op_t2'], how='left')
    df_sofa_full = df_merged[~df_merged['op_mid'].isna()]
    df_sofa_ = df_merged[df_merged['op_mid'].isna()][df_sofa_.columns]
    print(f'BOTH teams step: Binded={df_sofa_full.shape}, Total={df_sofa_full.shape}, Rest={df_sofa_.shape}')

    # First team step: match on the first word of t1, plus full t2 and country.
    # Ambiguous club names shared by several teams are excluded from fuzzy matching.
    teams_exclude = ['inter', 'racing', 'liverpool', 'nacional', 'arsenal', 'san jose']
    df_sofa_none = df_sofa_[df_sofa_['t1'].isin(teams_exclude)]
    df_sofa_ = df_sofa_[~df_sofa_['t1'].isin(teams_exclude)]
    df_merged = df_sofa_.merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'ds', 't1_first', 'country']],
                               left_on=['ds', 't1_first', 't2', 'country'], right_on=['ds', 't1_first', 'op_t2', 'country'], how='left')
    df_binded = df_merged[~df_merged['op_mid'].isna()]
    df_sofa_full = pd.concat([df_sofa_full, df_binded], axis=0)
    df_sofa_ = df_merged[df_merged['op_mid'].isna()][df_sofa_.columns]
    print(f'First team step: Binded={df_binded.shape}, Total={df_sofa_full.shape}, Rest={df_sofa_.shape}, Excluded={df_sofa_none.shape}')

    # Second team step: match on the first word of t2, plus full t1 and country.
    teams_exclude = ['racing', 'arsenal']
    df_sofa_none = pd.concat([df_sofa_none, df_sofa_[df_sofa_['t2'].isin(teams_exclude)]], axis=0)
    df_sofa_ = df_sofa_[~df_sofa_['t2'].isin(teams_exclude)]
    df_merged = df_sofa_.merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'ds', 't2_first', 'country']],
                               left_on=['ds', 't2_first', 't1', 'country'], right_on=['ds', 't2_first', 'op_t1', 'country'], how='left')
    df_binded = df_merged[~df_merged['op_mid'].isna()]
    df_sofa_full = pd.concat([df_sofa_full, df_binded], axis=0)
    df_sofa_ = df_merged[df_merged['op_mid'].isna()][df_sofa_.columns]
    # Hand the excluded ambiguous rows back together with the still-unbound rest.
    df_sofa_ = pd.concat([df_sofa_, df_sofa_none], axis=0)
    print(f'Second team step: Binded={df_binded.shape}, Total={df_sofa_full.shape}, Rest={df_sofa_.shape}, Excluded={df_sofa_none.shape}')

    return (df_sofa_full, df_sofa_)

def check_teams(df):
    """Collect distinct (tid, op_tid) team pairs seen on either side of the bound matches."""
    a = df[['country', 'tid1', 't1', 'op_tid1', 'op_t1']]
    b = df[['country', 'tid2', 't2', 'op_tid2', 'op_t2']]
    a.columns = b.columns = ['country', 'tid', 't', 'op_tid', 'op_t']
    teams = pd.concat([a, b], axis=0).drop_duplicates().sort_values(by='tid')
    mask = teams.tid.duplicated(keep=False)  # tids bound to more than one OP team
    # display(teams[mask])
    return teams

def save(df, teams):
    """Append the team mapping and the bound matches to their CSVs, dropping duplicates."""
    fn = 'data/teams_ss_op.csv'
    if path.exists(fn):
        teams_old = pd.read_csv(fn, index_col=None)
        teams = pd.concat([teams_old, teams], axis=0).drop_duplicates()
    teams.to_csv(fn, index=False)

    fn = 'data/binds_ss_op.csv'
    cols = ['country', 'ds', 'mid', 'tid1', 'tid2', 't1', 't2', 'op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2']
    if path.exists(fn):
        df_old = pd.read_csv(fn, index_col=None).drop_duplicates()
        df = pd.concat([df_old[cols], df[cols]], axis=0)
    print('save', df.shape)
    df = df.drop_duplicates(subset=['mid', 'op_mid'])
    print('save', df.shape)
    df[cols].to_csv(fn, index=False)

def filter_tids(df, teams):
    """Attach known op_tid mappings and split matches by how many team ids were resolved."""
    teams_ = teams.rename(columns={'tid': 'tid1', 'op_tid': 'op_tid1'})
    df_ = df.merge(teams_[['tid1', 'op_tid1', 'country']], on=['tid1', 'country'], how='left')
    print('T1 merged: ', df_.shape)
    teams_ = teams.rename(columns={'tid': 'tid2', 'op_tid': 'op_tid2'})
    df_ = df_.merge(teams_[['tid2', 'op_tid2', 'country']], on=['tid2', 'country'], how='left')
    print('T2 merged: ', df_.shape)
    df_both = df_[~(df_['op_tid1'].isna() | df_['op_tid2'].isna())]
    df_1 = df_[~df_['op_tid1'].isna() & df_['op_tid2'].isna()]
    df_2 = df_[df_['op_tid1'].isna() & ~df_['op_tid2'].isna()]
    df_none = df_[df_['op_tid1'].isna() & df_['op_tid2'].isna()]
    print('IN: {}, BOTH: {}, ONLY T1: {}, ONLY T2: {}, NO BINDS: {}, OUT: {}'.format(
        len(df.index), len(df_both.index), len(df_1.index), len(df_2.index), len(df_none.index),
        len(df_both.index) + len(df_1.index) + len(df_2.index) + len(df_none.index)))
    return df_both, df_1, df_2, df_none

def process_by_tid(df_ss, df_op, type='both'):
    """Bind by resolved team ids: first on the exact kickoff timestamp, then within the same day."""
    df_op_ = df_op.copy()
    df_op_ = df_op_.rename(columns={'tid1': 'op_tid1', 'tid2': 'op_tid2', 't1': 'op_t1', 't2': 'op_t2', 'mid': 'op_mid'})
    print(f'IN: Sofa={df_ss.shape}, OP={df_op_.shape}')
    df_ss['date'] = df_ss.ds.apply(lambda x: x.strftime('%d-%m-%Y'))
    df_op_['date'] = df_op_.ds.apply(lambda x: x.strftime('%d-%m-%Y'))

    if type == 'both':
        # Both team ids are known.
        df_merged = df_ss.merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'ds']], on=['ds', 'op_tid1', 'op_tid2'], how='left')
        df_binded = df_merged[~df_merged['op_mid'].isna()]
        df_none = df_merged[df_merged['op_mid'].isna()][df_ss.columns]
        print(f'Both teams step, exact dates: Binded={df_binded.shape}, Total={df_binded.shape}, Rest={df_none.shape}')
        df_merged = df_none.merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'date', 'country']], on=['date', 'op_tid1', 'op_tid2', 'country'], how='left')
        df_binded1 = df_merged[~df_merged['op_mid'].isna()]
        df_binded = pd.concat([df_binded, df_binded1], axis=0).drop_duplicates()
        df_ss = df_merged[df_merged['op_mid'].isna()][df_ss.columns]
        print(f'Both teams step, within a day: Binded={df_binded1.shape}, Total={df_binded.shape}, Rest={df_ss.shape}')

    if type == 'first':
        # Only the first team id is known; drop the empty op_tid2 before merging.
        df_merged = df_ss[[x for x in df_ss.columns if x != 'op_tid2']].merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'ds']], on=['ds', 'op_tid1'], how='left')
        df_binded = df_merged[~df_merged['op_mid'].isna()]
        df_none = df_merged[df_merged['op_mid'].isna()][df_ss.columns]
        print(f'First team step, exact dates: Binded={df_binded.shape}, Total={df_binded.shape}, Rest={df_none.shape}')
        df_merged = df_none[[x for x in df_none.columns if x != 'op_tid2']].merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'date', 'country']], on=['date', 'op_tid1', 'country'], how='left')
        df_binded1 = df_merged[~df_merged['op_mid'].isna()]
        df_binded = pd.concat([df_binded, df_binded1], axis=0).drop_duplicates()
        df_ss = df_merged[df_merged['op_mid'].isna()][df_ss.columns]
        print(f'First team step, within a day: Binded={df_binded1.shape}, Total={df_binded.shape}, Rest={df_ss.shape}')

    if type == 'second':
        # Only the second team id is known; drop the empty op_tid1 before merging.
        df_merged = df_ss[[x for x in df_ss.columns if x != 'op_tid1']].merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'ds']], on=['ds', 'op_tid2'], how='left')
        df_binded = df_merged[~df_merged['op_mid'].isna()]
        df_none = df_merged[df_merged['op_mid'].isna()][df_ss.columns]
        print(f'Second team step, exact dates: Binded={df_binded.shape}, Total={df_binded.shape}, Rest={df_none.shape}')
        df_merged = df_none[[x for x in df_none.columns if x != 'op_tid1']].merge(df_op_[['op_mid', 'op_tid1', 'op_tid2', 'op_t1', 'op_t2', 'date', 'country']], on=['date', 'op_tid2', 'country'], how='left')
        df_binded1 = df_merged[~df_merged['op_mid'].isna()]
        df_binded = pd.concat([df_binded, df_binded1], axis=0).drop_duplicates()
        df_ss = df_merged[df_merged['op_mid'].isna()][df_ss.columns]
        print(f'Second team step, within a day: Binded={df_binded1.shape}, Total={df_binded.shape}, Rest={df_ss.shape}')

    return df_binded.drop(columns='date'), df_ss.drop(columns='date')

def bind_iteration(n, df, df_ss, df_op):
    print(f'**** {n} ITERATION ****')
    teams = check_teams(df)
    save(df, teams)

    df_both, df_1, df_2, df_none = filter_tids(df_ss, teams)

    df_binded, df_both = process_by_tid(df_both, df_op, type='both')
    df = pd.concat([df, df_binded], axis=0).drop_duplicates()
    print(df.shape)

    df_binded, df_1 = process_by_tid(df_1, df_op, type='first')
    df = pd.concat([df, df_binded], axis=0).drop_duplicates()
    print(df.shape)

    df_binded, df_2 = process_by_tid(df_2, df_op, type='second')
    df = pd.concat([df, df_binded], axis=0).drop_duplicates()
    print(df.shape)
    teams = check_teams(df)
    save(df, teams)
    return df
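
For orientation, a minimal driver sketch for the helpers above. Everything named in it is an assumption, not part of this commit: the module name bind, the input file names, and the iteration count are hypothetical; only the call sequence follows the functions defined here.

import pandas as pd

import bind  # hypothetical module name for the file above

# Hypothetical inputs; both frames need the columns the binder uses:
# ds, country, mid, tid1, tid2, t1, t2.
df_sofa = pd.read_csv('data/sofa_matches.csv', parse_dates=['ds'])
df_op = pd.read_csv('data/op_matches.csv', parse_dates=['ds'])

# Pass 1: bind by date and team names; this seeds the tid -> op_tid mapping.
df_bound, df_rest = bind.bind_full(df_sofa, df_op)

# Further passes reuse the accumulated team mapping to bind the rest by team id.
for n in range(1, 4):
    df_bound = bind.bind_iteration(n, df_bound, df_rest, df_op)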
@@ -0,0 +1,137 @@
import os
from os import path
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import pytz
from datetime import timezone, datetime, timedelta
# MinMaxScaler is used by _encode below, so it must be imported here as well.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

import api.util
# from api.op_dp import OpDataProvider
from op_dp import OpDataProvider
# from api.sofa_dp import SofaDataProvider
from sofa_dp import SofaDataProvider

class DataCollector:
    def __init__(self):
        self.LOCAL_TZ = 'Asia/Almaty'
        self.SERVER_TZ = 'UTC'
        self.DATA_PATH = 'data/'
        self.ELO_DATA_PATH = 'data/elo/'
        self.PREREQUISITES_PATH = 'prerequisites/'
        self.COL_CAT = []
        self.COL_NUM = []
        self.COL_LBL = []
        self.COL_INF = []
        # Referenced by _ff/_encode below; empty/False until configured.
        self.INCLUDE = []
        self.EXCLUDE = []
        self.LOAD = False

    def _load_prerequisites(self, name):
        with open(os.path.join(self.PREREQUISITES_PATH, name), 'rb') as f:
            encoder = pickle.load(f)
        return encoder

    def _save_prerequisite(self, name, data):
        os.makedirs(self.PREREQUISITES_PATH, mode=0o777, exist_ok=True)
        with open(os.path.join(self.PREREQUISITES_PATH, name), mode='wb') as f:
            pickle.dump(data, f)

    def _ff(self, columns):
        if len(self.INCLUDE) > 0:
            return [x for x in columns if x in self.INCLUDE]
        else:
            return [x for x in columns if x not in self.EXCLUDE]

    def _encode(self, enctype, features, outs, df):
        if (len(self.INCLUDE) > 0 and outs[0] in self.INCLUDE) or outs[0] in self.EXCLUDE:
            return df
        name = '_'.join(features)
        if self.LOAD:
            # Load under the same key the save branch writes below.
            encoder = self._load_prerequisites(f'{enctype}_{name}')
        else:
            if enctype == 'sc':
                encoder = MinMaxScaler()
            elif enctype == 'le':
                encoder = LabelEncoder()
            elif enctype == 'ohe':
                encoder = OneHotEncoder()
            if len(features) == 1:
                encoder.fit(df[features].values)
            else:
                # Fit one shared encoder over the union of both columns' values
                # (e.g. home and away team names share a single label space).
                encoder.fit(pd.concat([pd.DataFrame(df[features[0]].unique(), columns=[name]),
                                       pd.DataFrame(df[features[1]].unique(), columns=[name])])[name])
            self._save_prerequisite(f'{enctype}_{name}', encoder)
        if enctype == 'ohe':
            return encoder.transform(df[features].values).toarray()
        if len(features) == 1:
            df[outs[0]] = encoder.transform(df[features].values)
        else:
            df[outs[0]] = encoder.transform(df[features[0]])
            df[outs[1]] = encoder.transform(df[features[1]])
        return df

    def _encode_teams(self, df):
        teams_name = self.ELO_DATA_PATH + 'teams.csv'
        teams_saved = pd.read_csv(teams_name, index_col=None)
        teams = df[['team']].dropna().drop_duplicates()
        teams_new = teams[~teams.team.isin(teams_saved.team)]
        print(teams_new)
        if not teams_new.empty:
            print('New teams!')
            # Continue numbering after the largest saved id.
            next_id = teams_saved.id.max() + 1
            teams_list = []
            for row in teams_new.itertuples():
                if len(row.team) > 1:
                    teams_list.append({'team': row.team, 'id': next_id})
                    next_id += 1
            teams_saved = pd.concat([teams_saved, pd.DataFrame(teams_list)])
            teams_saved.id = teams_saved.id.astype(int)
            teams_saved.to_csv(teams_name, index=False)
        df = df.merge(teams_saved, on='team', how='left')
        return df

    def _add_elo(self, df_src, df_elo):
        df_teams = pd.read_csv(self.DATA_PATH + 'teams.csv', index_col=None)
        df_elo_merged = df_elo.merge(df_teams[['id', 'tid']], on='id', how='left').drop_duplicates()
        df_elo_merged = df_elo_merged.dropna()
        df_src['de'] = df_src.ds.apply(lambda x: x.strftime('%Y-%m-%d'))
        df_elo_merged = df_elo_merged.rename(columns={'tid': 'tid1', 'elo': 'elo1'})
        df_src = df_src.merge(df_elo_merged[['tid1', 'de', 'elo1']], on=['tid1', 'de'], how='left')
        df_elo_merged = df_elo_merged.rename(columns={'tid1': 'tid2', 'elo1': 'elo2'})
        df_src = df_src.merge(df_elo_merged[['tid2', 'de', 'elo2']], on=['tid2', 'de'], how='left')
        return df_src

    def _provide_elo(self):
        df = pd.concat(map(pd.read_csv, glob.glob(os.path.join(self.DATA_PATH + 'elo/', 'elo_*.csv'))))
        df = df[['Club', 'Country', 'Level', 'Elo', 'From', 'To']]
        df.columns = ['team', 'country', 'level', 'elo', 'ds', 'de']
        df = self._encode_teams(df)
        return df

    def _provide_sofa(self):
        dp = SofaDataProvider(load=True)
        df = dp._load_data()
        return df.drop_duplicates(subset='mid', keep='last')

    def _provide_op(self):
        dp = OpDataProvider(load=True)
        df = dp._load_data()
        return df

    def _bind_sofa_op(self):
        # Binding of Sofascore and OP matches is not wired up here yet.
        df_sofa = self._provide_sofa()
        df_op = self._provide_op()
        return None

    def _load_data(self):
        df_sofa = self._provide_sofa()
        df_elo = self._provide_elo()
        df_sofa = self._add_elo(df_sofa, df_elo)
        return df_sofa

    def provide_data(self):
        # Not implemented yet.
        return None
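
As a rough smoke test, the collector might be exercised as below. The module name data_collector, the presence of the data/ and data/elo/ inputs, and the t1/t2 column names (taken from the binding code above) are all assumptions; provide_data() is still a stub in this commit.

from data_collector import DataCollector  # hypothetical module name

dc = DataCollector()
df = dc._load_data()  # Sofascore matches enriched with elo1/elo2 columns
print(df[['t1', 't2', 'elo1', 'elo2']].head())

# Shared-encoder path of _encode: one LabelEncoder fit over both team name
# columns, so t1 and t2 map into the same label space.
df = dc._encode('le', ['t1', 't2'], ['t1_le', 't2_le'], df)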