project_tmdb_lib.py
# Import the required libraries
import numpy as np
import pandas as pd
import math
import random
# Imports from user-defined libraries
import decision_tree2 as dtree
import optimization as opt
# Get the data from the TMDB dataset
def getUserFeatureMatrix(ratingcsv, moviefeaturecsv, maxUsers):
    df = pd.read_csv(moviefeaturecsv)
    df.fillna(0, inplace=True)
    df.set_index('id', inplace=True)
    df2 = pd.read_csv(ratingcsv, usecols=['userId', 'movieId', 'rating'])
    df2.fillna(0, inplace=True)
    df2['userId'] = pd.to_numeric(df2['userId'])
    df2['movieId'] = pd.to_numeric(df2['movieId'])
    df2['rating'] = pd.to_numeric(df2['rating'])
    commonMovieIds = df.index.intersection(df2['movieId'].unique(), sort=None).tolist()
    ratingdf = df2[df2['movieId'].isin(commonMovieIds)]
    featuresdf = df[df.index.isin(commonMovieIds)]
    # Drop users, keeping the maxUsers users with the most ratings
    users = df2.groupby(['userId'], sort=False).size().sort_values(ascending=False).index.values.tolist()
    users = users[:maxUsers]
    ratingdf = ratingdf[ratingdf['userId'].isin(users)]
    num_users = len(users)
    num_movies = len(commonMovieIds)
    print(num_users)
    print(num_movies)
    utility_mat = np.zeros((num_users, num_movies))
    for _, row in ratingdf.iterrows():
        utility_mat[users.index(int(row['userId']))][commonMovieIds.index(int(row['movieId']))] = row['rating']
    # .as_matrix() was removed in pandas 1.0; use .to_numpy() instead
    movieFeaturesMat = featuresdf.to_numpy()
    userFeaturesMat = np.matmul(utility_mat, movieFeaturesMat)
    num_genre = 19
    num_prod_comp = 23
    num_prod_ctry = 32
    num_cast = 13
    # Normalize each feature group (genre, cast, production company, production
    # country) to unit L2 norm; the small epsilon avoids division by zero
    for i in range(userFeaturesMat.shape[0]):
        genre_sum = np.linalg.norm(userFeaturesMat[i][:num_genre], None) + 0.0001
        userFeaturesMat[i][:num_genre] = userFeaturesMat[i][:num_genre] / genre_sum
        cast_sum = np.linalg.norm(userFeaturesMat[i][num_genre:num_genre + num_cast], None) + 0.0001
        userFeaturesMat[i][num_genre:num_genre + num_cast] = userFeaturesMat[i][num_genre:num_genre + num_cast] / cast_sum
        prod_com_sum = np.linalg.norm(userFeaturesMat[i][num_genre + num_cast:num_genre + num_cast + num_prod_comp], None) + 0.0001
        userFeaturesMat[i][num_genre + num_cast:num_genre + num_cast + num_prod_comp] = userFeaturesMat[i][num_genre + num_cast:num_genre + num_cast + num_prod_comp] / prod_com_sum
        prod_ctry_sum = np.linalg.norm(userFeaturesMat[i][num_genre + num_cast + num_prod_comp:num_genre + num_cast + num_prod_comp + num_prod_ctry], None) + 0.0001
        userFeaturesMat[i][num_genre + num_cast + num_prod_comp:num_genre + num_cast + num_prod_comp + num_prod_ctry] = userFeaturesMat[i][num_genre + num_cast + num_prod_comp:num_genre + num_cast + num_prod_comp + num_prod_ctry] / prod_ctry_sum
    userFeatureDF = pd.DataFrame(userFeaturesMat, columns=df.columns.tolist())
    userFeatureDF = userFeatureDF.loc[:, (userFeatureDF != 0).any(axis=0)]
    utilityDF = pd.DataFrame(utility_mat, columns=commonMovieIds)
    utilityDF = utilityDF.loc[:, (utilityDF != 0).any(axis=0)]
    return utilityDF, userFeatureDF
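# Example usage (an illustrative sketch; the CSV paths and user cap are the
# same defaults run() uses below):
#   utilityDF, userFeatureDF = getUserFeatureMatrix(
#       "./the-movies-dataset/ratings.csv",
#       "./tmdb_movies_cross_features.csv", 1500)
#   rating_matrix = utilityDF.to_numpy()      # users x movies utility matrix
#   feature_matrix = userFeatureDF.to_numpy() # users x normalized features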
# Get the data from the MovieLens dataset (legacy loader, kept for reference)
# def getRatingMatrix(filename):
#     # Open the file for reading data
#     file = open(filename, "r")
#     while 1:
#         # Read all the lines from the file and store them in lines
#         lines = file.readlines(1000000000)
#         # If lines is empty, simply break
#         if not lines:
#             break
#         # Create a dictionary of dictionaries
#         User_Movie_Dict = {}
#         # Create a list to hold all the data
#         data = []
#         print("Number of Lines: ", len(lines))
#         # For each data entry, get the Y and the Xs in their respective lists
#         for line in lines:
#             # Get all the attributes by splitting on '::' and
#             # use a list comprehension to convert string to float
#             list1 = line.split("\n")[0].split("::")
#             list1 = [float(j) for j in list1]
#             # Add to the data
#             data.append(list1)
#         # Convert the data into a numpy array
#         data_array = np.array(data)
#         # Get the indices of the maximum values in each column
#         a = np.argmax(data, axis=0)
#         num_users = data[a[0]][0]
#         num_movies = data[a[1]][1]
#         print("Number of Users: ", num_users)
#         print("Number of Movies: ", num_movies)
#         # Create and initialize the rating matrix to hold all the rating values
#         ratingMatrix = np.zeros((int(num_users), int(num_movies)))
#         for list1 in data:
#             ratingMatrix[int(list1[0]) - 1][int(list1[1]) - 1] = list1[2]
#     # Return both the array and the dict
#     return (User_Movie_Dict, ratingMatrix)
# Alternating-least-squares update for the movie vectors, holding the user
# vectors fixed: for each movie j, solve the regularized least-squares system
#     v_j = (sum_i u_i u_i^T + lambda_r * I)^{-1} (sum_i r_ij * u_i)
# where the sums run over the users i who rated movie j.
def cf_movie_vector(rating_matrix, user_vectors, movie_vectors, K, lambda_r=0.02):
    # Stores the movie profile vectors
    movie_profiles = np.zeros((len(rating_matrix[0]), K))
    for j in range(len(rating_matrix[0])):
        first_term = np.zeros((K, K))
        for i in range(len(rating_matrix)):
            if rating_matrix[i][j] > 0:
                first_term = np.add(first_term, np.outer(user_vectors[i], user_vectors[i]))
        # Regularize, then take the inverse of the first term
        first_term = np.add(first_term, np.multiply(lambda_r, np.eye(K)))
        first_term = np.linalg.inv(first_term)
        second_term = np.zeros(K)
        for i in range(len(user_vectors)):
            if rating_matrix[i][j] > 0:
                second_term = np.add(second_term, np.multiply(rating_matrix[i][j], user_vectors[i]))
        movie_profiles[j] = np.dot(first_term, second_term)
    return movie_profiles
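# A vectorized sketch of the same per-movie update (an illustrative
# alternative, not part of the original pipeline): masking the rated entries
# lets numpy build the normal equations for one movie in a single shot.
def cf_movie_vector_vectorized(rating_matrix, user_vectors, K, lambda_r=0.02):
    R = np.asarray(rating_matrix)
    U_all = np.asarray(user_vectors)
    profiles = np.zeros((R.shape[1], K))
    for j in range(R.shape[1]):
        rated = R[:, j] > 0                    # users who rated movie j
        U = U_all[rated]                       # their latent vectors
        A = U.T @ U + lambda_r * np.eye(K)     # regularized Gram matrix
        b = U.T @ R[rated, j]                  # rating-weighted user vectors
        profiles[j] = np.linalg.solve(A, b)    # solve instead of inverting
    return profiles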
# Function to calculate the RMSE between the predicted and actual ratings
def getRMSE(Actual_Rating, Predicted_Rating):
    # Accumulate the squared error over the observed (non-zero) ratings only
    squared_error = 0.0
    num_ratings = 0
    for i in range(len(Actual_Rating)):
        for j in range(len(Actual_Rating[0])):
            if Actual_Rating[i][j] > 0:
                squared_error += pow(Actual_Rating[i][j] - Predicted_Rating[i][j], 2)
                num_ratings += 1
    # Root Mean Squared Error = sqrt(mean squared error over rated entries)
    rmse = math.sqrt(squared_error / max(num_ratings, 1))
    # Print and return the RMSE
    print('Root Mean Squared Error (RMSE) = ', rmse)
    return rmse
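# Quick sanity check (illustrative): predicting the observed ratings exactly
# should give an RMSE of 0.
#   actual = np.array([[5.0, 0.0], [0.0, 3.0]])
#   getRMSE(actual, actual)   # -> 0.0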
# Randomly split the rows of the data into training and test indices
def random_split(data):
    # Split the data set 75% / 25%
    SPLIT_PERCENT = 0.75
    # Get random indices to shuffle the rows around
    indices = np.random.permutation(data.shape[0])
    # Get the number of rows
    num_rows = data.shape[0]
    # Get the indices for the training and testing sets
    training_indices, test_indices = indices[:int(SPLIT_PERCENT * num_rows)], indices[int(SPLIT_PERCENT * num_rows):]
    # Return the training and test indices
    return training_indices, test_indices
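# Example (illustrative):
#   data = np.zeros((100, 4))
#   train_idx, test_idx = random_split(data)
#   train, test = data[train_idx], data[test_idx]  # ~75 and ~25 rows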
# Returns the decision tree and movie vectors that approximate ratings for all
# users and movies via alternating optimization (fMf): fit a tree to get user
# vectors, solve for movie vectors, and repeat.
def alternateOptimization(rating_matrix, feature_matrix, NUM_OF_FACTORS, MAX_DEPTH, ITERATIONS, LAMBDA):
    # Save and print the number of users and movies
    NUM_USERS = rating_matrix.shape[0]
    NUM_MOVIES = rating_matrix.shape[1]
    print("Number of Users:", NUM_USERS)
    print("Number of Movies:", NUM_MOVIES)
    print("Number of Latent Factors:", NUM_OF_FACTORS)
    # Create the user and item profile vectors of appropriate size.
    # Initialize the item vectors randomly.
    user_vectors = np.zeros((NUM_USERS, NUM_OF_FACTORS), dtype=float)
    movie_vectors = np.random.rand(NUM_MOVIES, NUM_OF_FACTORS)
    i = 0
    print("Entering main loop of alternateOptimization")
    decTree = dtree.Tree(dtree.Node(None, 1), rating_matrix, NUM_OF_FACTORS, MAX_DEPTH)
    # Runs for a fixed number of iterations; no explicit convergence check
    while i < ITERATIONS:
        # Rebuild the decision tree based on the current movie_vectors
        decTree = dtree.Tree(dtree.Node(None, 1), rating_matrix, NUM_OF_FACTORS, MAX_DEPTH)
        decTree.fitTree(decTree.root, rating_matrix, feature_matrix, movie_vectors, NUM_OF_FACTORS)
        # Calculate the user vectors using the fitted tree
        user_vectors = decTree.getUserVectors(feature_matrix, NUM_OF_FACTORS)
        # Optimize the movie vectors using the calculated user vectors
        movie_vectors = cf_movie_vector(rating_matrix, user_vectors, movie_vectors, NUM_OF_FACTORS, LAMBDA)
        i = i + 1
    # Return the fitted tree and the movie factor matrix (factors x movies)
    return (decTree, movie_vectors.T)
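# Putting it together (illustrative; the Tree/getUserVectors API comes from
# the user-defined decision_tree2 module and is assumed here, mirroring run()):
#   tree, movie_vector = alternateOptimization(train_rating, train_feature,
#                                              10, 5, 5, 0.08)
#   user_vectors = tree.getUserVectors(test_feature, 10)
#   predicted = np.dot(user_vectors, movie_vector)  # users x movies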
# Prints the top-K recommendations among movies the users have not yet rated
def printTopKMovies(test, predicted, K=2):
    zero_list = []
    movie_list = []
    for i in range(len(test)):
        for j in range(len(test[0])):
            if test[i][j] == 0:
                zero_list.append(predicted[i][j])
                movie_list.append(j)
    zero_array = np.array(zero_list)
    movie_array = np.array(movie_list)
    # Sort by predicted rating in descending order so the best come first
    args = np.argsort(zero_array)[::-1]
    movie_array = movie_array[args]
    print("Top movies not rated by the user")
    print(movie_array[0:K])
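# Note: np.argsort sorts ascending, so the [::-1] above reverses it to put the
# highest predictions first, e.g.
#   np.argsort(np.array([0.2, 0.9, 0.5]))[::-1]  # -> array([1, 2, 0])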
# End-to-end driver: load the data, split it, train with alternating
# optimization, and report the RMSE on the held-out test users.
def run(max_users=1500, Lambda=0.08, iterations=5, max_depth=5, num_factors=10):
    MAX_NUM_USERS = max_users
    # Get the data
    ratingDF, featureDF = getUserFeatureMatrix("./the-movies-dataset/ratings.csv", "./tmdb_movies_cross_features.csv", MAX_NUM_USERS)
    # .as_matrix() was removed in pandas 1.0; use .to_numpy() instead
    rating_matrix = ratingDF.to_numpy()
    feature_matrix = featureDF.to_numpy()
    # (User_Movie_Dict, data) = getRatingMatrix("ratings.dat")
    print("Dimensions of the Dataset: ", rating_matrix.shape)
    # Split the users 75/25 into training and testing sets
    (train_indices, test_indices) = random_split(rating_matrix)
    train_rating = rating_matrix[train_indices, :]
    test_rating = rating_matrix[test_indices, :]
    train_feature = feature_matrix[train_indices, :]
    test_feature = feature_matrix[test_indices, :]
    print("Dimensions of the Training Set: ", train_rating.shape)
    print("Dimensions of the Testing Set: ", test_rating.shape)
    # Split the testing set 75/25 into answer and evaluation sets
    (answer_indices, evaluation_indices) = random_split(test_rating)
    answer_rating = test_rating[answer_indices, :]
    evaluation_rating = test_rating[evaluation_indices, :]
    answer_feature = test_feature[answer_indices, :]
    evaluation_feature = test_feature[evaluation_indices, :]
    print("Dimensions of the Answer Set: ", answer_rating.shape)
    print("Dimensions of the Evaluation Set: ", evaluation_rating.shape)
    # Set the hyperparameters
    NUM_OF_FACTORS = num_factors
    MAX_DEPTH = max_depth
    ITERATIONS = iterations
    LAMBDA = Lambda
    (decisionTree, movie_vector) = alternateOptimization(train_rating, train_feature, NUM_OF_FACTORS, MAX_DEPTH, ITERATIONS, LAMBDA)
    user_vectors = decisionTree.getUserVectors(test_feature, NUM_OF_FACTORS)
    Predicted_Rating = np.dot(user_vectors, movie_vector)
    print("Predicted_Rating for Test: ", Predicted_Rating)
    print("Test Rating: ", test_rating)
    rmse = getRMSE(test_rating, Predicted_Rating)
    print("RMSE on Testing: ", rmse)
    return rmse
# Optional CSV dumps of the true and predicted ratings:
# pd.DataFrame(test_rating, index=ratingDF.index.values[test_indices], columns=ratingDF.columns).to_csv("true_rating_TMDB.csv")
# pd.DataFrame(Predicted_Rating, index=ratingDF.index.values[test_indices], columns=ratingDF.columns).to_csv("prediction_rating_TMDB.csv")
# Top K new recommendations:
# printTopKMovies(test, Predicted_Rating, 1)
# Testing scratchpad (legacy; splitUsers is not defined in this module):
# testMatrix = np.array([[0, 5, 3, 4, 0], [3, 4, 0, 3, 1], [3, 0, 4, 0, 2], [4, 4, 4, 3, 0], [3, 5, 0, 4, 0]])
# (indices_like, indices_dislike, indices_unknown) = splitUsers(data, 2293)
# like = data[indices_like]
# dislike = data[indices_dislike]
# unknown = data[indices_unknown]
# print("Dimensions of the Like: ", like.shape)
# print("Dimensions of the Dislike: ", dislike.shape)
# print("Dimensions of the Unknown: ", unknown.shape)
# Get the decision tree and the item profile vectors using alternate optimization
# (decisionTree, item_vector) = alternateOptimization(train)
# train = test = np.array([[0, 5, 3, 4, 0], [3, 4, 0, 3, 1], [3, 0, 4, 0, 2], [4, 4, 4, 3, 0], [3, 5, 0, 4, 0]])
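# A minimal entry point, added for convenience (assumes the two CSV files
# referenced in run() are present in the working directory):
if __name__ == "__main__":
    run()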