Skip to content

Commit

Permalink
Today's
Browse files Browse the repository at this point in the history
  • Loading branch information
hwait committed Jan 31, 2021
1 parent 848b72d commit 07a15ce
Show file tree
Hide file tree
Showing 20 changed files with 2,786 additions and 739 deletions.
Binary file added api/__pycache__/bind.cpython-38.pyc
Binary file not shown.
Binary file modified api/__pycache__/data_collector.cpython-38.pyc
Binary file not shown.
Binary file modified api/__pycache__/data_provider.cpython-38.pyc
Binary file not shown.
Binary file modified api/__pycache__/op_dp.cpython-38.pyc
Binary file not shown.
Binary file modified api/__pycache__/op_parser.cpython-38.pyc
Binary file not shown.
Binary file modified api/__pycache__/sofa_dp.cpython-38.pyc
Binary file not shown.
Binary file modified api/__pycache__/sofa_parser.cpython-38.pyc
Binary file not shown.
7 changes: 4 additions & 3 deletions api/bind.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
from os import path
import pandas as pd
import numpy as np
import pytz
Expand Down Expand Up @@ -49,7 +50,7 @@ def check_teams(df):
b=df[['country','tid2','t2','op_tid2','op_t2']]
a.columns=b.columns=['country','tid','t','op_tid','op_t']
teams=pd.concat([a,b], axis=0).drop_duplicates().sort_values(by='tid')
mask = teams.tid.duplicated(keep=False)
#mask = teams.tid.duplicated(keep=False)
#display(teams[mask])
return teams

Expand Down Expand Up @@ -85,7 +86,7 @@ def filter_tids(df, teams):
return df_both, df_1,df_2,df_none


def process_by_tid(df_ss, df_op, type='both'):
def process_by_tid(df_ss, df_op, type='both'):
df_op_=df_op.copy()
df_op_=df_op_.rename(columns={'tid1':'op_tid1','tid2':'op_tid2','t1':'op_t1','t2':'op_t2','mid':'op_mid'})
print(f'IN: Sofa={df_ss.shape}, OP={df_op_.shape}')
Expand Down Expand Up @@ -132,7 +133,7 @@ def process_by_tid(df_ss, df_op, type='both'):
return df_binded.drop(columns='date'),df_ss.drop(columns='date')


def bind_iteration(n,df, df_ss, df_op):
def bind_iteration(n,df, df_ss, df_op):
print(f'**** {n} ITERATION ****')
teams=check_teams(df)
save(df,teams)
Expand Down
104 changes: 88 additions & 16 deletions api/data_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

import api.util
#from api.op_dp import OpDataProvider
from op_dp import OpDataProvider
#from api.sofa_dp import SofaDataProvider
from sofa_dp import SofaDataProvider
from api.op_dp import OpDataProvider
#from op_dp import OpDataProvider
from api.sofa_dp import SofaDataProvider
#from sofa_dp import SofaDataProvider

class DataCollector:
def __init__(self):
def __init__(self, today=False):
self.LOCAL_TZ = 'Asia/Almaty'
self.SERVER_TZ = 'UTC'
self.DATA_PATH='data/'
Expand All @@ -26,14 +26,14 @@ def __init__(self):
self.COL_NUM=[]
self.COL_LBL=[]
self.COL_INF=[]
self.TODAY=today

def _load_prerequisites(self,name):
with open(os.path.join(self.PREREQUISITES_PATH, name),'rb') as f:
encoder = pickle.load(f)
return encoder

def _save_prerequisite(self, name, data):
folder='prerequisites/'
os.makedirs(self.PREREQUISITES_PATH, mode=0o777, exist_ok=True)
with open(os.path.join(self.PREREQUISITES_PATH, name), mode='wb') as f:
pickle.dump(data, f)
Expand Down Expand Up @@ -96,7 +96,7 @@ def _encode_teams(self, df):
def _add_elo(self, df_src,df_elo):
df_teams=pd.read_csv(self.DATA_PATH+'teams.csv', index_col=None)
df_elo_merged=df_elo.merge(df_teams[['id','tid']], on='id', how='left').drop_duplicates()
df_elo_merged=df_elo_merged.dropna()
#df_elo_merged=df_elo_merged.dropna()
df_src['de']=df_src.ds.apply(lambda x: x.strftime('%Y-%m-%d'))
df_elo_merged=df_elo_merged.rename(columns={'tid':'tid1', 'elo':'elo1'})
df_src=df_src.merge(df_elo_merged[['tid1','de','elo1']], on=['tid1','de'], how='left')
Expand All @@ -105,33 +105,105 @@ def _add_elo(self, df_src,df_elo):
return df_src

def _provide_elo(self):
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join(self.DATA_PATH+'elo/', 'elo_*.csv'))))
if self.TODAY:
df = pd.read_csv(self.DATA_PATH+'elo/elo_{:%Y-%m-%d}.csv'.format(datetime.today()-timedelta(days=1)), index_col=None)
else:
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join(self.DATA_PATH+'elo/', 'elo_*.csv'))))
df=df[['Club', 'Country', 'Level', 'Elo', 'From', 'To']]
df.columns=['team', 'country', 'level', 'elo', 'ds', 'de']
df=self._encode_teams(df)
return df

def _provide_sofa(self):
dp=SofaDataProvider(load=True)
dp=SofaDataProvider(load=True, today=self.TODAY)
df=dp._load_data()
print(len(df))
return df.drop_duplicates(subset='mid', keep='last')

def _provide_op(self):
dp=OpDataProvider(load=True)
dp=OpDataProvider(load=True, today=self.TODAY)
df=dp._load_data()
return df

def _bind_sofa_op(self):
df_sofa=self._provide_sofa()
def _bind_today(self,df):
    """Attach OddsPortal odds columns to today's Sofascore fixtures.

    Maps Sofascore team ids (tid1/tid2) to OddsPortal ids via data/teams.csv,
    then left-joins the odds/probability/drift columns produced by the OP
    provider onto *df* by the (op_tid1, op_tid2) team pair.
    """
    # tid -> op_tid lookup table built from the binding file.
    df_teams=pd.read_csv('data/teams.csv', index_col=None)
    df_teams=df_teams[['tid','op_tid']].drop_duplicates()

    # NOTE(review): these two merges use the default inner join, so fixtures
    # whose teams are missing from teams.csv are silently dropped — confirm
    # that is intended for the "today" pipeline.
    df=df.merge(df_teams, left_on='tid1', right_on='tid')
    df=df.rename(columns={'op_tid':'op_tid1'})
    df=df.drop(columns=['tid'])
    df=df.merge(df_teams, left_on='tid2', right_on='tid')
    df=df.rename(columns={'op_tid':'op_tid2'})
    df=df.drop(columns=['tid'])

    # Pull today's OddsPortal matches and join their odds by the OP team pair.
    df_op=self._provide_op()
    df_op=df_op.rename(columns={'tid1':'op_tid1','tid2':'op_tid2'})
    df=df.merge(df_op[['op_tid1','op_tid2', 'odds_away','odds_draw','odds_home', 'oddsprob_home', 'oddsprob_draw', 'oddsprob_away', 'drift_home', 'drift_away', 'drift_draw']], on=['op_tid1','op_tid2'], how='left')
    return df

def _bind_sofa_op(self,df):
df_op=self._provide_op()
return None
df_binds=pd.read_csv('data/binds_ss_op.csv', index_col=None)
df_op=df_op.merge(df_binds[['op_mid','mid']], left_on='mid', right_on='op_mid')
return df.merge(df_op[['mid_y','odds_away','odds_draw','odds_home','oddsprob_home','oddsprob_draw','oddsprob_away','drift_home','drift_away','drift_draw']], left_on='mid', right_on='mid_y', how='left')

def _load_data(self):
    """Assemble the base match dataframe: Sofascore matches enriched with Elo.

    When ``self.TODAY`` is set, additionally joins today's OddsPortal odds
    via ``_bind_today``. Returns the merged dataframe.
    """
    df_sofa=self._provide_sofa()
    df_elo=self._provide_elo()
    df_sofa=self._add_elo(df_sofa,df_elo)
    if self.TODAY:
        df_sofa=self._bind_today(df_sofa)
    # Historical odds binding is disabled here; kept for reference.
    #df_sofa=self._bind_sofa_op(df_sofa)
    return df_sofa

def provide_data(self):

return None
def provide_today(self, double=True):
    """Return today's fixtures shaped like the training data, targets zeroed.

    Upcoming matches have no result yet, so the point-spread (psft/psht) and
    winner (w1/wx/w2) target columns are set to 0. With ``double=True`` every
    match is emitted twice: once from the home perspective (side=1) and once
    from the away perspective (side=0) with the paired columns swapped.
    """
    df=self._load_data()
    # No outcome is known for today's matches: zero all target columns.
    df['psft']=0
    df['psht']=0
    df['w1']=0
    df['wx']=0
    df['w2']=0
    df_home=df.copy()
    df_home=df_home.rename(columns={'homeScoreHT':'ht1','awayScoreHT':'ht2','sc1':'ft1','sc2':'ft2','vote_home':'vote1','vote_draw':'votex','vote_away':'vote2','home_formation':'form1','away_formation':'form2','oddsprob_home':'oddsprob1','oddsprob_draw':'oddsprobx','oddsprob_away':'oddsprob2','drift_home':'drift1','drift_draw':'driftx','drift_away':'drift2'})
    if double:
        df_home['side']=1
        df_away=df.copy()
        df_away['side']=0
        # NOTE(review): df_away is copied from df *before* the home-style
        # rename, so keys like 'oddsprob1'/'drift1' in this mapping do not
        # exist there yet — pandas silently ignores them, and the away frame
        # may keep the *_home/*_away column names. Verify against
        # provide_data and downstream consumers.
        df_away=df_away.rename(columns={'vote_home':'vote2','vote_draw':'votex','vote_away':'vote1',
            'home_formation':'form2','away_formation':'form1','elo1':'elo2','elo2':'elo1','t1':'t2','t2':'t1',
            'tid1':'tid2','tid2':'tid1','odds_away':'odds_home','odds_home':'odds_away','oddsprob1':'oddsprob2',
            'oddsprob2':'oddsprob1','drift1':'drift2','drift2':'drift1'})
        # Targets are all zero here; the sign flip mirrors provide_data for symmetry.
        df_away['psft']=df_away['psft']*-1
        df_away['psht']=df_away['psht']*-1

        df_home=pd.concat([df_home,df_away], axis=0)

    return df_home.reset_index(drop=True)

def provide_data(self, double=True):
    """Return the historical training dataframe with target columns.

    Builds point-spread targets from full-time and half-time scores
    (psft/psht) and one-hot winner flags (w1/wx/w2). With ``double=True``
    each match appears twice — home view (side=1) and away view (side=0)
    with every paired home/away column swapped and spreads negated.
    """
    df=self._load_data()
    # Full-time / half-time point spread from the home side's perspective.
    df['psft']=df.sc1-df.sc2
    df['psht']=df.homeScoreHT-df.awayScoreHT
    df['w1']=np.where(df.winner=='home',1,0)
    df['wx']=np.where(df.winner=='draw',1,0)
    df['w2']=np.where(df.winner=='away',1,0)
    df_home=df.copy()
    df_home=df_home.rename(columns={'homeScoreHT':'ht1','awayScoreHT':'ht2','sc1':'ft1','sc2':'ft2','vote_home':'vote1','vote_draw':'votex','vote_away':'vote2','home_formation':'form1','away_formation':'form2','oddsprob_home':'oddsprob1','oddsprob_draw':'oddsprobx','oddsprob_away':'oddsprob2','drift_home':'drift1','drift_draw':'driftx','drift_away':'drift2'})
    if double:
        df_home['side']=1
        df_away=df.copy()
        df_away['side']=0
        # pandas applies a rename mapping simultaneously, so mutual swaps
        # such as w1<->w2 and possession1<->possession2 are safe.
        # NOTE(review): the 'oddsprob1'/'drift1' keys assume home-style names,
        # but df_away is copied from df before that rename — confirm these
        # odds columns are actually swapped as intended.
        df_away=df_away.rename(columns={'homeScoreHT':'ht2','awayScoreHT':'ht1','sc1':'ft2','sc2':'ft1','vote_home':'vote2','vote_draw':'votex','vote_away':'vote1',
            'home_formation':'form2','away_formation':'form1','w1':'w2','w2':'w1','elo1':'elo2','elo2':'elo1','t1':'t2','t2':'t1',
            'tid1':'tid2','tid2':'tid1','odds_away':'odds_home','odds_home':'odds_away','oddsprob1':'oddsprob2',
            'oddsprob2':'oddsprob1','drift1':'drift2','drift2':'drift1',
            'possession1':'possession2', 'shont1':'shont2', 'shofft1':'shofft2', 'corners1':'corners2',
            'offsides1':'offsides2', 'fouls1':'fouls2', 'cards1':'cards2', 'gksaves1':'gksaves2',
            'possession2':'possession1', 'shont2':'shont1', 'shofft2':'shofft1', 'corners2':'corners1',
            'offsides2':'offsides1', 'fouls2':'fouls1', 'cards2':'cards1', 'gksaves2':'gksaves1'})
        # Spreads flip sign when viewed from the away side.
        df_away['psft']=df_away['psft']*-1
        df_away['psht']=df_away['psht']*-1

        df_home=pd.concat([df_home,df_away], axis=0)

    return df_home.reset_index(drop=True)
50 changes: 38 additions & 12 deletions api/data_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ def __init__(self):

self.DATA_FILE='matches.csv'
self.DATA_DONE_FILE='matches_done.csv'

self.DATA_TODAY_FILE='matches_today.csv'
self.DATA_INPLAY_FILE='matches_inplay.csv'
self.SS_DATA_PATH='data/sofa/'
self.FB_DATA_PATH='data/fbref/'
self.OP_DATA_PATH='data/op/'
self.ELO_DATA_PATH='data/elo/'

self.SS_DAYS_RAW_PATH='raw/sofa/days/'
self.SS_MATCHES_RAW_PATH='raw/sofa/matches/'
self.FB_DAYS_RAW_PATH='raw/fbref/days/'
Expand Down Expand Up @@ -156,17 +156,18 @@ def _load_match_info(self, data):
referer=f'https://www.sofascore.com/football/{dstr}'
self.HEADERS=self._generate_headers(referer)
is_loaded=self._load_json('votes',data)
print(', lineups...', end='')
if is_loaded:
is_loaded=self._load_json('lineups',data)
if stage>0 and is_loaded: # Match started
print(', graph...', end='')
self._load_json('graph',data)
print(', statistics...', end='')
self._load_json('statistics',data)
print(', lineups...', end='')
self._load_json('lineups',data)
print(', incidents...', end='')
self._load_json('incidents',data)
if stage>89: # Match completed
self.df_matches.loc[self.df_matches['id']==mid,'done']=1
#if stage>89: # Match completed
self.df_matches.loc[self.df_matches['id']==mid,'done']=1
print(' done.')

def _append_save(self,df, f):
Expand Down Expand Up @@ -238,24 +239,28 @@ def _load_day(self, d):
print(f'ERROR {r.status_code}!!!', end='')
self.SERVER_ERROR=True
self._load_data(d)




def load_matches(self):
self.COUNTER=0
self.PAUSE=True
file_name=self.SS_DATA_PATH+self.DATA_FILE
file_done_name=self.SS_DATA_PATH+self.DATA_DONE_FILE
file_inplay_name=self.SS_DATA_PATH+self.DATA_INPLAY_FILE
file_today_name=self.SS_DATA_PATH+self.DATA_TODAY_FILE
self.df_matches=pd.read_csv(file_name, index_col=None)
self.df_matches = self.df_matches.sample(frac=1, axis=1).reset_index(drop=True)
self.DATA=self.df_matches.loc[(self.df_matches['done']==0) & (self.df_matches['status']>89)][['id', 'status', 'ts']].values
self.DATA=self.df_matches.loc[self.df_matches['done']==0][['id', 'status', 'ts']].values
np.random.shuffle(self.DATA)
self.TYPE='matches'
for data in self.DATA:
#print('LOOP:', data)
self._load_data(data)
self._append_save(self.df_matches[self.df_matches['done']==1], file_done_name)
self.df_matches[self.df_matches['done']==0].to_csv(file_name, index=False)
self._append_save(self.df_matches[(self.df_matches['done']==1) & (self.df_matches['status']>89)], file_done_name)
self._append_save(self.df_matches[(self.df_matches['done']==1) & (self.df_matches['status']>0) & (self.df_matches['status']<=89)], file_inplay_name)
self._append_save(self.df_matches[(self.df_matches['done']==1) & (self.df_matches['status']==0)], file_today_name)
self.df_matches=self.df_matches[self.df_matches['done']==0]
self.df_matches.to_csv(file_name, index=False)

def load_days(self, ds=None,de=None):
self.df_matches=pd.read_csv(self.SS_DATA_PATH+self.DATA_FILE, index_col=None)
Expand All @@ -268,7 +273,6 @@ def load_days(self, ds=None,de=None):
while d<=de:
dates.append(d)
d+=timedelta(days=1)

dates=np.array(dates)
np.random.shuffle(dates)
self.COUNTER=0
Expand Down Expand Up @@ -481,6 +485,28 @@ def load_op_matches(self):
c+=1
#break

def load_op_matches_today(self):
    """Scrape OddsPortal pages for today's matches and flag completed rows.

    Iterates the rows of OP_DATA_PATH + DATA_TODAY_FILE with done==0 in
    random order, downloads each match page into OP_MATCHES_RAW_PATH, and
    marks the row done once the response contains odds data.
    """
    options = {
        'connection_keep_alive': True,
        'connection_timeout': None
    }
    #self.firefox = webdriver.Firefox(executable_path=r'../lib/geckodriver.exe',seleniumwire_options=options)
    self.firefox = webdriver.Firefox(executable_path=r'../lib/geckodriver.exe')
    # NOTE(review): .scopes is a selenium-wire feature; on the plain selenium
    # driver constructed above this is an inert attribute assignment and
    # filters nothing (and `options` is unused) — confirm which driver is
    # intended here.
    self.firefox.scopes = ['fb.oddsportal.com/feed/match/*']
    csv_name=self.OP_DATA_PATH+self.DATA_TODAY_FILE
    df_matches=pd.read_csv(csv_name, index_col=None)
    # Shuffle so the site is not hit in a predictable order.
    df_matches=df_matches.sample(frac=1).reset_index(drop=True)
    for row in df_matches[df_matches['done']==0].itertuples():
        link=row.link
        # The match id is the last dash-separated token of the URL slug.
        file_name=self.OP_MATCHES_RAW_PATH+link.split('/')[4].split('-')[-1]+'.json'
        #print(link, file_name)
        html=self._load_link(file_name,link)
        if "oddsdata" in html:
            df_matches.at[row.Index, 'done'] = 1
        # NOTE(review): saving inside the loop checkpoints progress after
        # every match; verify this placement against the original indentation.
        print('saving...')
        df_matches.to_csv(csv_name, index=False)
        #break

def load_elos(self, ds, de):
d = datetime.strptime(ds, '%Y-%m-%d')
de = datetime.strptime(de, '%Y-%m-%d')
Expand Down
18 changes: 13 additions & 5 deletions api/op_dp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,MinMaxScaler

class OpDataProvider:
def __init__(self, include=[],exclude=[], load=False):
def __init__(self, include=[],exclude=[], load=False, today=False):
self.LOCAL_TZ = 'Asia/Almaty'
self.SERVER_TZ = 'UTC'
self.DATA_PATH='data/op/'
Expand All @@ -18,15 +18,15 @@ def __init__(self, include=[],exclude=[], load=False):
self.COL_NUM=[]
self.COL_LBL=[]
self.COL_INF=[]
self.LOAD=load
self.TODAY=today
self.LOAD=True if today else load

def _load_prerequisites(self,name):
with open(os.path.join(self.PREREQUISITES_PATH, name),'rb') as f:
encoder = pickle.load(f)
return encoder

def _save_prerequisite(self, name, data):
folder='prerequisites/'
os.makedirs(self.PREREQUISITES_PATH, mode=0o777, exist_ok=True)
with open(os.path.join(self.PREREQUISITES_PATH, name), mode='wb') as f:
pickle.dump(data, f)
Expand Down Expand Up @@ -88,7 +88,10 @@ def _encode(self, enctype, features, outs, df):
def _provide_odds(self, df_src):
self.COL_NUM+=['drift_home','drift_draw','drift_away']
#self.COL_NUM+=['oddsprob_home','oddsprob_draw','oddsprob_away','drift_home','drift_draw','drift_away']
df=pd.read_csv(self.DATA_PATH+'odds.csv', index_col=False)
if self.TODAY:
df=pd.read_csv(self.DATA_PATH+'odds_today.csv', index_col=False)
else:
df=pd.read_csv(self.DATA_PATH+'odds.csv', index_col=False)
df=df.dropna()
df['w1']=1/df['w1']
df['w2']=1/df['w2']
Expand Down Expand Up @@ -126,7 +129,12 @@ def _provide_matches(self):
self.COL_CAT+=cat_colums
self.COL_LBL+=label_colums
cols=np.unique(info_colums+num_colums+cat_colums+label_colums)
df=pd.read_csv(self.DATA_PATH+'matches_done.csv', index_col=False)
if self.TODAY:
df=pd.read_csv(self.DATA_PATH+'matches_today.csv', index_col=False)
df['sc1']=0
df['sc2']=0
else:
df=pd.read_csv(self.DATA_PATH+'matches_done.csv', index_col=False)
df = df.rename(columns={'odds1': 'odds_home','oddsdraw': 'odds_draw','odds2': 'odds_away'})
df['t1']=df['t1'].replace('[^a-zA-Z0-9 ]', '', regex=True).str.lower()
df['t2']=df['t2'].replace('[^a-zA-Z0-9 ]', '', regex=True).str.lower()
Expand Down
Loading

0 comments on commit 07a15ce

Please sign in to comment.