# ContentKNNAlgorithm.py
from surprise import AlgoBase
from surprise import PredictionImpossible
from MovieLens import MovieLens
import math
import numpy as np
import heapq
class ContentKNNAlgorithm(AlgoBase):
    """Content-based k-nearest-neighbours rating predictor.

    Item-to-item similarity is derived from movie metadata (the genre
    vectors and release years returned by ``MovieLens``) instead of from
    rating patterns. A rating is predicted as the similarity-weighted
    average of the user's own ratings on the ``k`` most similar items.
    """

    def __init__(self, k=40, sim_options=None):
        """Initialise the algorithm.

        Args:
            k: Maximum number of neighbours used for each prediction.
            sim_options: Accepted but not used by this class.
        """
        AlgoBase.__init__(self)
        self.k = k

    def fit(self, trainset):
        """Build the item-item content similarity matrix.

        Stores a symmetric ``n_items x n_items`` array in
        ``self.similarities`` where each off-diagonal entry is the product
        of the genre similarity and the year similarity of the two items.
        The diagonal is left at zero.

        Args:
            trainset: Training set passed through to ``AlgoBase.fit``.

        Returns:
            This instance.
        """
        AlgoBase.fit(self, trainset)

        # Only genres and years feed the similarity score below, so the
        # mise-en-scene data is not loaded here.
        ml = MovieLens()
        genres = ml.getGenres()
        years = ml.getYears()

        print("Computing content-based similarity matrix...")

        n_items = self.trainset.n_items

        # Resolve every inner item id to its raw movie id once, rather than
        # repeating the conversion for each pair inside the nested loop.
        movie_ids = [
            int(self.trainset.to_raw_iid(inner_id))
            for inner_id in range(n_items)
        ]

        self.similarities = np.zeros((n_items, n_items))

        for this_item in range(n_items):
            if this_item % 100 == 0:
                print(this_item, " of ", n_items)
            this_movie = movie_ids[this_item]
            # Only the upper triangle is computed; each value is mirrored.
            for other_item in range(this_item + 1, n_items):
                other_movie = movie_ids[other_item]
                genre_sim = self.computeGenreSimilarity(
                    this_movie, other_movie, genres)
                year_sim = self.computeYearSimilarity(
                    this_movie, other_movie, years)
                score = genre_sim * year_sim
                self.similarities[this_item, other_item] = score
                self.similarities[other_item, this_item] = score

        print("...done.")

        return self

    def computeGenreSimilarity(self, movie1, movie2, genres):
        """Return the cosine similarity of two movies' genre vectors.

        Args:
            movie1: Key of the first movie in ``genres``.
            movie2: Key of the second movie in ``genres``.
            genres: Mapping from movie key to a sequence of numbers.

        Returns:
            ``sum(x*y) / sqrt(sum(x*x) * sum(y*y))`` over the paired
            elements, or ``0.0`` when that denominator is zero (an empty or
            all-zero vector), where the cosine is undefined. Vectors of
            different lengths are compared over their common prefix.
        """
        genres1 = genres[movie1]
        genres2 = genres[movie2]
        sumxx = sumxy = sumyy = 0
        for x, y in zip(genres1, genres2):
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y

        denominator = math.sqrt(sumxx * sumyy)
        if denominator == 0:
            # No usable genre information for at least one of the movies.
            return 0.0
        return sumxy / denominator

    def computeYearSimilarity(self, movie1, movie2, years):
        """Return a similarity that decays with the gap between two years.

        Args:
            movie1: Key of the first movie in ``years``.
            movie2: Key of the second movie in ``years``.
            years: Mapping from movie key to a numeric year.

        Returns:
            ``exp(-|year1 - year2| / 10)``: 1.0 for equal years, approaching
            0 as the gap grows.
        """
        diff = abs(years[movie1] - years[movie2])
        return math.exp(-diff / 10.0)

    def computeMiseEnSceneSimilarity(self, movie1, movie2, mes):
        """Combine per-feature differences of two mise-en-scene vectors.

        This method is not called by ``fit``.

        NOTE(review): the returned value is a product of absolute
        differences, so it grows as the movies become less alike and is 0
        when any single feature matches exactly. Confirm this is the
        intended score before using it as a similarity.

        Args:
            movie1: Key of the first movie in ``mes``.
            movie2: Key of the second movie in ``mes``.
            mes: Mapping from movie key to a feature sequence; positions
                0, 1, 3, 5 and 6 are read. The feature names used below
                follow the original code; the vector layout is defined by
                the data loader (TODO confirm).

        Returns:
            The product of the five absolute differences, or 0 when either
            movie's feature sequence is falsy (e.g. empty).
        """
        mes1 = mes[movie1]
        mes2 = mes[movie2]
        if not (mes1 and mes2):
            return 0

        shotLengthDiff = math.fabs(mes1[0] - mes2[0])
        colorVarianceDiff = math.fabs(mes1[1] - mes2[1])
        motionDiff = math.fabs(mes1[3] - mes2[3])
        lightingDiff = math.fabs(mes1[5] - mes2[5])
        numShotsDiff = math.fabs(mes1[6] - mes2[6])
        return (shotLengthDiff * colorVarianceDiff * motionDiff
                * lightingDiff * numShotsDiff)

    def estimate(self, u, i):
        """Predict user ``u``'s rating of item ``i``.

        Args:
            u: Inner user id, used to index ``self.trainset.ur``.
            i: Inner item id, used to index ``self.similarities``.

        Returns:
            The similarity-weighted mean of the user's ratings over the
            (at most) ``self.k`` most similar items they rated.

        Raises:
            PredictionImpossible: If the user or item is not known to the
                training set, or if none of the selected neighbours has a
                positive similarity to ``i``.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        # Pair each of the user's ratings with the similarity between the
        # rated item and the target item.
        neighbors = [
            (self.similarities[i, rated_item], rating)
            for rated_item, rating in self.trainset.ur[u]
        ]

        # Keep only the k highest-similarity ratings.
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

        # Weighted average over neighbours with positive similarity. The
        # target item itself never contributes, because fit() leaves the
        # diagonal of the similarity matrix at zero.
        simTotal = weightedSum = 0
        for simScore, rating in k_neighbors:
            if simScore > 0:
                simTotal += simScore
                weightedSum += simScore * rating

        if simTotal == 0:
            raise PredictionImpossible('No neighbors')

        return weightedSum / simTotal