-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuildPandasDF.py
317 lines (260 loc) · 12 KB
/
buildPandasDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
from scipy.stats.stats import pearsonr
import operator
import pandas as pd
import numpy as np
import os
import pickle
def recompute_frame(ldf):
    """Return a copy of ldf with all averages and review counts recomputed.

    Used after a frame has been subsetted, when previously stored statistics
    no longer describe the rows that remain.

    ldf must have 'user', 'gameName' and 'rating' columns and is not
    modified.  The copy gains (or has overwritten) the columns 'game_avg',
    'game_review_count', 'user_avg' and 'user_review_count', computed from
    ldf's own rows.  It comes back with a fresh 0..n-1 index and with 'user'
    and 'gameName' moved to the front of the column order.
    """
    refreshed = ldf.copy()
    # gameName is handled before user so the resulting column order stays
    # what callers have always received.
    for key, prefix in (('gameName', 'game'), ('user', 'user')):
        ratings_by_key = ldf.groupby(key).rating
        # Index the copy by the grouping key so the per-key statistics are
        # matched to rows by label when assigned.
        refreshed = refreshed.set_index([key])
        refreshed[prefix + '_avg'] = ratings_by_key.mean()
        refreshed[prefix + '_review_count'] = ratings_by_key.count()
        refreshed = refreshed.reset_index()
    return refreshed
def pearson_sim(game1_reviews, game2_reviews, n_common):
    """Pearson correlation between two games' user-average-subtracted ratings.

    game1_reviews / game2_reviews are the review subframes for the two games,
    restricted to reviewers who rated both; each needs 'rating' and
    'user_avg' columns.  Ratings are paired by row position, so row i of both
    frames must belong to the same reviewer.

    n_common is the number of reviewers the games share.

    Returns the correlation coefficient, or 0. when fewer than two reviewers
    are shared.  The result can be NaN (e.g. when one side has no variance);
    callers are expected to handle that.
    """
    # A correlation needs at least two points; pearsonr rejects shorter
    # input, so treat that case as "no evidence of similarity".
    if n_common < 2:
        return 0.
    diff1 = game1_reviews['rating'] - game1_reviews['user_avg']
    diff2 = game2_reviews['rating'] - game2_reviews['user_avg']
    # Hand over plain arrays: the two frames have unrelated index labels and
    # the pairing is positional.
    return pearsonr(diff1.values, diff2.values)[0]
def cosine_similarity(game1_reviews, game2_reviews, n_common):
    """Alternative similarity metric for two games: cosine of their rating vectors.

    Computes (v1 dot v2) / (||v1|| * ||v2||), where v1 and v2 are the
    'rating' columns of the two review subframes.  Ratings are paired by row
    position, so both frames must list the same reviewers in the same order.

    n_common is accepted so this function has the same signature as the other
    similarity functions; it is not used.

    Returns 0. when either vector is empty or all zeros, since the cosine is
    undefined there.
    """
    v1 = np.asarray(game1_reviews['rating'].values, dtype=float)
    v2 = np.asarray(game2_reviews['rating'].values, dtype=float)
    norm_product = np.sqrt(np.dot(v1, v1) * np.dot(v2, v2))
    if norm_product == 0:
        return 0.
    return np.dot(v1, v2) / norm_product
def calculate_similarity(game1,game2,df,similarity_func):
    """Score how similar two games are using a caller-supplied metric.

    game1, game2    -- the gameName values to compare
    df              -- the reviews dataframe to draw ratings from
    similarity_func -- called as similarity_func(game1_reviews,
                       game2_reviews, n_common) on the reviews written by
                       users who rated both games

    Returns (similarity, n_common), where n_common is the number of users who
    reviewed both games.  A NaN similarity is reported as 0.
    """
    reviewers1, reviewers2 = (
        set(df[df.gameName == name].user.unique()) for name in (game1, game2)
    )
    common_reviewers = reviewers1 & reviewers2
    n_common = len(common_reviewers)
    sim = similarity_func(
        get_game_reviews(game1, df, common_reviewers),
        get_game_reviews(game2, df, common_reviewers),
        n_common,
    )
    return (0 if np.isnan(sim) else sim, n_common)
def get_game_reviews(game, df, set_of_users):
    """Return the sub-dataframe of one game's reviews written by the given users.

    game         -- the gameName to select
    df           -- reviews dataframe with 'user' and 'gameName' columns
    set_of_users -- the reviewers to keep (e.g. the reviewers two games have
                    in common)

    Each user contributes at most one row (their first in df).  Rows are
    returned sorted by user, so two frames built for the same set of users
    line up row-for-row; the similarity functions pair ratings by position
    and would otherwise compare different reviewers' ratings.
    """
    mask = df.user.isin(set_of_users) & (df.gameName == game)
    reviews = df[mask]
    # Keep only the first review a user left for this game.
    reviews = reviews[~reviews.user.duplicated()]
    return reviews.sort_values('user', kind='mergesort')
def shrunk_sim(sim, n_common, reg=1000.):
    """Shrink a similarity toward zero according to its common support.

    The similarity is scaled by n_common / (n_common + reg), which
    down-weights comparisons backed by few common reviewers; a larger reg
    shrinks harder.
    """
    support_weighted = n_common * sim
    return support_weighted / (n_common + reg)
class Database:
    """A database of pairwise game similarities and common supports.

    Attributes:
      df           -- the reviews dataframe the database was built from
      gameNames    -- dict mapping each gameName to its row/column position
      database_sim -- square float matrix of similarities, indexed by position
      database_sup -- square int matrix of common supports, indexed by position

    Both matrices start out all zeros and are filled in by
    populate_by_calculating.
    """

    def __init__(self, df):
        """Set up empty matrices sized for every distinct gameName in df.

        df is a reviews dataframe like smallDf.
        """
        self.df = df
        self.gameNames = {name: pos for (pos, name) in enumerate(df.gameName.unique())}
        n_games = len(self.gameNames)
        self.database_sim = np.zeros([n_games, n_games])
        # The builtin int replaces the np.int alias, which NumPy has removed.
        self.database_sup = np.zeros([n_games, n_games], dtype=int)

    def populate_by_calculating(self, similarity_func):
        """Fill both matrices for every pair of games in df.

        similarity_func is a metric such as pearson_sim; it is handed to
        calculate_similarity for each distinct pair.  Each pair is computed
        once and mirrored across the diagonal.  A diagonal entry gets
        similarity 1. and, as support, the number of ratings of that game.

        Progress is printed as "<pairs visited> <total pairs>" every 100
        pairs visited.
        """
        counter = 0
        items = list(self.gameNames.items())
        totalComparisons = len(items) ** 2
        for g1, i1 in items:
            for g2, i2 in items:
                counter += 1
                if counter % 100 == 0:
                    print('%d %d' % (counter, totalComparisons))
                if i1 < i2:
                    sim, nsup = calculate_similarity(g1, g2, self.df, similarity_func)
                    self.database_sim[i1][i2] = sim
                    self.database_sim[i2][i1] = sim
                    self.database_sup[i1][i2] = nsup
                    self.database_sup[i2][i1] = nsup
                elif i1 == i2:
                    nsup = self.df[self.df.gameName == g1].user.count()
                    self.database_sim[i1][i1] = 1.
                    self.database_sup[i1][i1] = nsup

    def get(self, g1, g2):
        """Return the tuple (similarity, common_support) for two game names.

        Raises KeyError if either name is not one of the database's games.
        """
        i1 = self.gameNames[g1]
        i2 = self.gameNames[g2]
        return (self.database_sim[i1][i2], self.database_sup[i1][i2])
def knearest(gameName,set_of_games,dbase,k=7,reg=1000):
    """Return the k games most similar to gameName, best match first.

    gameName     -- the game to find neighbors for
    set_of_games -- candidate game names; despite the name this must be an
                    indexable sequence (list or array), not a Python set
    dbase        -- object whose get(g1, g2) returns (similarity, n_common)
    k            -- maximum number of neighbors to return; if fewer
                    candidates are available they are all returned
    reg          -- regularizer passed to shrunk_sim

    Returns a list of (gameName, shrunk_similarity, n_common) tuples ranked
    by shrunk similarity.  gameName itself is never included in its own
    neighbors; when it appears among the candidates the next-best candidate
    takes its place.
    """
    sims = [dbase.get(gameName, iterGame) for iterGame in set_of_games]
    shrunkSims = [(shrunk_sim(sim, nsup, reg), nsup) for sim, nsup in sims]
    # Rank from a real list: np.argsort cannot rank a lazy map object.
    sortedInds = np.argsort([score for score, _ in shrunkSims])[::-1]
    # Dropping the self-comparison before truncating is what lets the
    # next-best candidate fill the vacated slot.
    ranked = [
        (set_of_games[i], shrunkSims[i][0], shrunkSims[i][1])
        for i in sortedInds
        if set_of_games[i] != gameName
    ]
    return ranked[:k]
#"get the sorted top 5 games for a user by the rating the user gave them"
def get_user_top_choices(user, df, numchoices=5):
    """Return the games a user rated highest, best first.

    NOTE: a function with this same name and signature is defined again later
    in this file; that later definition is the one in effect once the module
    has finished loading.

    user       -- the user whose ratings to look at
    df         -- reviews dataframe with 'user', 'gameName' and 'rating'
    numchoices -- maximum number of rows to return

    Returns a dataframe with the columns 'gameName' and 'rating', sorted by
    rating in descending order.
    """
    user_ratings = df[df.user == user][['gameName', 'rating']]
    # sort_values replaces DataFrame.sort, which pandas has removed.
    return user_ratings.sort_values(['rating'], ascending=False).head(numchoices)
def get_top_recos_for_user(user, df, dbase, n=5, k=8, reg=200):
    """Recommend games to a user based on the neighbors of their favorites.

    The user's n top-rated games are looked up; for each, the k nearest
    games (via knearest with regularizer reg) are collected, skipping any
    game the user has already rated.

    user  -- the user to recommend for
    df    -- reviews dataframe with 'user', 'gameName' and 'rating' columns
    dbase -- similarity database handed through to knearest

    Returns a list of (gameName, average_rating) tuples sorted by the game's
    average rating in df, highest first.
    """
    # Games the user has already rated are never recommended back to them.
    userAlreadyRatedGames = set(df[df.user == user].gameName.values)
    games = get_user_top_choices(user, df, numchoices=n)['gameName'].values
    # The candidate pool does not change between favorites; build it once.
    allGames = df.gameName.unique()
    # A set, so a game that neighbors several favorites is listed only once.
    neighborGames = set()
    for userTopGame in games:
        for nearGame in knearest(userTopGame, allGames, dbase, k, reg):
            if nearGame[0] not in userAlreadyRatedGames:
                neighborGames.add(nearGame[0])
    # Average rating of every game in the passed df.
    gameRatings = df.groupby('gameName')['rating'].mean()
    recs = [(neighborGameName, gameRatings[neighborGameName]) for neighborGameName in neighborGames]
    # Rank from a real list: np.argsort cannot rank a lazy map object.
    sortedInds = np.argsort([rating for _, rating in recs])[::-1]
    return [recs[ind] for ind in sortedInds]
def knearest_amongst_userrated(gameName,user,df,dbase,k=7,reg=200.):
    """Return the k nearest neighbors of gameName among games the user rated.

    The candidate pool is the distinct gameName values on the user's rows in
    df; ranking is delegated to knearest with the given dbase, k and reg.
    """
    rated_by_user = df.loc[df.user == user, 'gameName'].unique()
    return knearest(gameName, rated_by_user, dbase, k, reg)
def calcBase(df,user,gameName):
    """Return the baseline rating estimate for a user/game pair.

    The baseline is the mean rating over all of df, plus how far the user's
    mean rating sits from that overall mean, plus how far the game's mean
    rating sits from it.
    """
    overall_mean = np.mean(df.rating)
    user_offset = np.mean(df[df.user == user].rating) - overall_mean
    game_offset = np.mean(df[df.gameName == gameName].rating) - overall_mean
    return overall_mean + user_offset + game_offset
def getRating(df,gameName):
    """Return the rating on the first row of df whose gameName matches.

    df is expected to hold a single user's reviews.  When no row matches,
    'no rating found' is printed and None is returned.
    """
    matches = df[df.gameName == gameName].rating.values
    try:
        return matches[0]
    except IndexError:
        # Only the "no such game" case is handled here; any other failure
        # (e.g. a missing column) is a real error and is allowed to surface.
        print('no rating found')
        return None
def ratingPredictor(df,dbase,gameName,user,k=7, reg=200.):
    """Predict the rating a user would give a game.

    Starts from the baseline calcBase(df, user, gameName) and adds a
    similarity-weighted average of how far the user's actual ratings of the
    k nearest games they have rated deviate from those games' own baselines.

    df       -- reviews dataframe
    dbase    -- similarity database; get(g1, g2) returns (similarity, support)
    gameName -- the game to predict for
    user     -- the user to predict for
    k, reg   -- neighbor count and regularizer used to pick the neighbors

    When the neighbor similarities sum to zero or NaN (including when there
    are no neighbors at all), 'no similarities' is printed and the baseline
    alone is returned.
    """
    userReviews = df[df.user == user]
    yum_base = calcBase(df, user, gameName)
    kNearestUserRated = knearest_amongst_userrated(gameName, user, df, dbase, k, reg)
    neighborNames = [neighbor[0] for neighbor in kNearestUserRated]
    # The weights are the raw similarities from the database, not the shrunk
    # values that were used to rank the neighbors.
    s = np.array([dbase.get(gameName, name)[0] for name in neighborNames])
    yu = np.array([getRating(userReviews, name) for name in neighborNames])
    yuj_base = np.array([calcBase(df, user, name) for name in neighborNames])
    total_sim = s.sum()
    if total_sim == 0 or np.isnan(total_sim):
        print('no similarities')
        return yum_base
    return yum_base + ((yu - yuj_base) * s).sum() / total_sim
def get_other_ratings(gameName, user, df):
    """Get a user's rating for a game and the game's average rating.

    Both values are read from the first row of df matching the user and the
    game, using the 'rating' and 'game_avg' columns.  Returns the tuple
    (users_score, average_score); raises IndexError when no row matches.
    """
    is_match = (df.user == user) & (df.gameName == gameName)
    matching_rows = df[is_match]
    return tuple(matching_rows[column].values[0] for column in ('rating', 'game_avg'))
#"get the sorted top 5 games for a user by the rating the user gave them"
def get_user_top_choices(user, df, numchoices=5):
    """Return the games a user rated highest, best first.

    NOTE: this repeats an earlier definition of the same name in this file;
    being the later one, it is the definition in effect once the module has
    finished loading.

    user       -- the user whose ratings to look at
    df         -- reviews dataframe with 'user', 'gameName' and 'rating'
    numchoices -- maximum number of rows to return

    Returns a dataframe with the columns 'gameName' and 'rating', sorted by
    rating in descending order.
    """
    user_ratings = df[df.user == user][['gameName', 'rating']]
    # sort_values replaces DataFrame.sort, which pandas has removed.
    udf = user_ratings.sort_values(['rating'], ascending=False)
    return udf.head(numchoices)
def buildDfFromScrapedCsvs(dataDir='google_drive/gameRatings/'):
    """Build the full ratings dataframe from the saved per-game ratings csvs.

    dataDir -- directory holding the scraped csv files; entries that do not
               end in '.csv' are ignored.  Each csv must have exactly four
               columns, which are renamed to gameID, gameName, rating, user.

    Files are read in sorted name order so the result (and which duplicate
    rating counts as "first") does not depend on directory listing order.

    Returns the combined dataframe with one rating per (user, gameName) pair
    and the average / review-count columns added by recompute_frame.

    Raises ValueError when dataDir contains no csv files.
    """
    frames = []
    for fileName in sorted(os.listdir(dataDir)):
        if not fileName.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(dataDir, fileName))
        df.columns = ['gameID','gameName','rating','user']
        frames.append(df)
    if not frames:
        raise ValueError('no .csv rating files found in %s' % dataDir)
    # Concatenate once at the end rather than re-copying the growing frame
    # for every file.
    fullDf = pd.concat(frames, ignore_index=True)
    print('Built fullDf:\n%s \n%s' % (fullDf.head(), fullDf))
    # Some rare users rate games more than once. This keeps their first rating
    fullDf = fullDf.drop_duplicates(subset=['user','gameName'], keep='first')
    fullDf = recompute_frame(fullDf)
    return fullDf
# Module-level script: builds the full ratings frame from the scraped csvs,
# writes two csv exports, and derives a smaller frame of frequent raters.
fullDf=buildDfFromScrapedCsvs()
#some usernames contain a comma; replace it with a space
#(presumably so the comma-separated exports below stay easy to parse downstream -- TODO confirm)
fullDf['user']=fullDf['user'].map(lambda x: x.replace(',',' '))
##Save the dataframe to csv (swap in the commented-out line below to load a saved copy instead)
fullDf.to_csv('google_drive/fullGamesDf.csv',index=False)
#fullDf=pd.read_csv('google_drive/fullGamesDf.csv')
#Exports a csv containing only the columns to pass to computeSimMrjob (written without a header row)
subsetoffull=fullDf[['user','gameID', 'rating','game_avg','user_avg']]
subsetoffull.to_csv("subset-full.csv", index=False, header=False)
#create a smaller dataframe containing only the ratings by users with 25 or more ratings
#(number chosen arbitrarily)
smallDf=fullDf[fullDf.user_review_count>=25]
#averages and counts are recomputed so they describe only the rows that remain
smallDf=recompute_frame(smallDf)
# db=Database(smallDf)
# db.populate_by_calculating(pearson_sim)
# fout=open('google_drive/gameDbPickle','w')
# pickle.dump(db,fout)
# fout.close()
# #fin=open('google_drive/gameDbPickle','r')
# #db=pickle.load(fin)
# #fin.close()
# #define some variables for function testing
# testGame1='Mage Wars'
# testGame2='Terra Mystica'
# testuser="m4c14s"
# print 'the database and this function should return the same values'
# print calculate_similarity(testGame1,testGame2,smallDf,pearson_sim)
# print 'database load test: ', db.get(testGame1,testGame2)
# print "For user", testuser, "top rated games are:"
# gameRecs=get_user_top_choices(testuser, smallDf)['gameName'].values
# print gameRecs
# print "\nFor user", testuser, "the top recommendations are:"
# toprecos=get_top_recos_for_user(testuser, smallDf, db, n=5, k=7)
# for gameName, gameRating in toprecos:
# print gameName, '| aveRating:',gameRating
# print "\nfor user",testuser, 'avg', smallDf[smallDf.user==testuser].rating.mean()
# for game in gameRecs:
# print "----------------------------------"
# print game
# print "Predicted Rating:",ratingPredictor(smallDf, db, game, testuser, k=7, reg=1000.)
# u,a=get_other_ratings(game, testuser, smallDf)
# print "Actual User Rating:",u,"Avg Rating",a