"""Train KMeans clusterings of users and precompute, for each cluster, the
probability that a user in that cluster streams each song (days D1 to D30)."""
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer

# `config` is expected to expose at least config["nb_songs"], used below.
from options import config


def train_kmeans(dataset_path, master_path, clustering_path, nb_clusters, max_iter, random_state, embeddings_version="svd", clusters_filename=None):
    # user embeddings (target = only for train users)
    user_embeddings = pd.read_parquet(dataset_path + "/user_embeddings.parquet", engine='fastparquet')
    # expand the embedding vectors into one column per dimension
    list_embeddings = ["embedding_" + str(i) for i in range(len(user_embeddings[embeddings_version + "_embeddings"].iloc[0]))]
    user_embeddings[list_embeddings] = pd.DataFrame(user_embeddings[embeddings_version + "_embeddings"].tolist(), index=user_embeddings.index)
    user_embeddings_values = user_embeddings[list_embeddings].values
    # clustering (n_jobs and precompute_distances were removed from KMeans in
    # recent scikit-learn releases, so they are no longer passed here)
    kmeans = KMeans(n_clusters=nb_clusters, random_state=random_state, max_iter=max_iter).fit(user_embeddings_values)
    with open(master_path + "/" + clustering_path + "/" + clusters_filename, "wb") as f:
        pickle.dump(kmeans, f)
    # top songs by cluster
    user_embeddings["cluster"] = kmeans.labels_
    user_clusters = user_embeddings[["user_index", "cluster"]]
    # load songs listened to between D1 and D30 by train users
    features_train_path = dataset_path + "/user_features_train_" + embeddings_version + ".parquet"
    features_train = pd.read_parquet(features_train_path, engine='fastparquet').fillna(0)
    features_train = features_train.sort_values("user_index")
    features_train = features_train.reset_index(drop=True)  # keep positional order aligned with user_index for train data
    listd1d30 = features_train[["user_index", "d1d30_songs"]]
    listd1d30 = pd.merge(listd1d30, user_clusters, on="user_index")
    # one row per (user, song) stream
    listd1d30_exploded = listd1d30.explode('d1d30_songs')
    listd1d30_exploded["count"] = np.ones(len(listd1d30_exploded))
    listd1d30_by_cluster = pd.DataFrame(listd1d30_exploded.groupby(["cluster", "d1d30_songs"])['count'].count())
    # most popular songs by cluster: build an empty frame indexed by every
    # (cluster, song_index) pair so songs never streamed in a cluster count as 0
    nb_songs = config["nb_songs"]
    arrays = (np.repeat(np.arange(nb_clusters), repeats=nb_songs), np.tile(np.arange(nb_songs), nb_clusters))
    tuples = list(zip(*arrays))
    index_perso = pd.MultiIndex.from_tuples(tuples, names=["cluster", "song_index"])
    df = pd.DataFrame(index=["default"], columns=index_perso).T
    both = pd.concat([listd1d30_by_cluster, df], axis=1)[["count"]].fillna(0)
    both = both.reset_index(drop=False)
    both.columns = ["cluster", "song_index", "nb_streams"]
    data_by_cluster = pd.DataFrame(both.groupby("cluster")['nb_streams'].sum())
    data_by_cluster.columns = ["nb_streams_by_cluster"]
    data_by_cluster_and_song = pd.merge(both, data_by_cluster, on="cluster")
    data_by_cluster_and_song["segment_proba"] = data_by_cluster_and_song["nb_streams"] / data_by_cluster_and_song["nb_streams_by_cluster"]
    if not os.path.exists("{}/{}/".format(master_path, clustering_path + "_probas_" + embeddings_version)):
        os.mkdir("{}/{}/".format(master_path, clustering_path + "_probas_" + embeddings_version))
    # one pickled list of song probabilities per cluster; rows are assumed to be
    # sorted by (cluster, song_index) after the concat above
    for cluster_id in range(nb_clusters):
        if cluster_id % 100 == 0:
            print("probas by cluster and by song computed for cluster: " + str(cluster_id))
        list_proba = []
        for song_index in range(nb_songs):
            list_proba.append(data_by_cluster_and_song.iloc[cluster_id * nb_songs + song_index]["segment_proba"])
        with open("{}/{}/list_proba_{}.pkl".format(master_path, clustering_path + "_probas_" + embeddings_version, cluster_id), "wb") as f:
            pickle.dump(list_proba, f)
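
# Example call (a minimal sketch, not taken from the repository: every path and
# hyperparameter value below is a placeholder chosen for illustration):
#
#   train_kmeans(
#       dataset_path="data",            # hypothetical directory holding the parquet files
#       master_path="models",           # hypothetical output root
#       clustering_path="kmeans_svd",   # directory under master_path (must already exist)
#       nb_clusters=1000,
#       max_iter=300,
#       random_state=42,
#       embeddings_version="svd",
#       clusters_filename="kmeans_svd.pkl",
#   )
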
def train_inputfeatureskmeans(dataset_path, master_path, clustering_path, nb_clusters, max_iter, random_state, nb_songs, embeddings_version="svd", clusters_filename=None):
    # train features
    user_features_train = pd.read_parquet(dataset_path + "/user_features_train_" + embeddings_version + ".parquet", engine='fastparquet')
    features_train = user_features_train.fillna(0).sort_values("user_index")
    # keep the numeric feature columns only (the first two columns are skipped)
    features_train_ = features_train.values[:, 2:]
    # L2-normalize each user's feature vector before clustering
    transformer = Normalizer().fit(features_train_)
    X_train = transformer.transform(features_train_)
    # clustering (n_jobs and precompute_distances were removed from KMeans in
    # recent scikit-learn releases, so they are no longer passed here)
    kmeans = KMeans(n_clusters=nb_clusters, random_state=random_state, max_iter=max_iter).fit(X_train)
    with open(master_path + "/" + clustering_path + "/" + clusters_filename, "wb") as f:
        pickle.dump(kmeans, f)
    # top songs by cluster
    features_train["cluster"] = kmeans.labels_
    listd1d30 = features_train[["user_index", "d1d30_songs", "cluster"]]
    listd1d30_exploded = listd1d30.explode('d1d30_songs')
    listd1d30_exploded["count"] = np.ones(len(listd1d30_exploded))
    listd1d30_by_cluster = pd.DataFrame(listd1d30_exploded.groupby(["cluster", "d1d30_songs"])['count'].count())
    # we need an empty frame indexed by every (cluster, song_index) combination
    # so that songs never streamed in a cluster get a count of 0
    arrays = (np.repeat(np.arange(nb_clusters), repeats=nb_songs),
              np.tile(np.arange(nb_songs), nb_clusters))
    tuples = list(zip(*arrays))
    index_perso = pd.MultiIndex.from_tuples(tuples, names=["cluster", "song_index"])
    df = pd.DataFrame(index=["test"], columns=index_perso).T
    both = pd.concat([listd1d30_by_cluster, df], axis=1)[["count"]].fillna(0)
    both = both.reset_index(drop=False)
    both.columns = ["cluster", "song_index", "nb_streams"]
    data_by_cluster = pd.DataFrame(both.groupby("cluster")['nb_streams'].sum())
    data_by_cluster.columns = ["nb_streams_by_cluster"]
    data_by_cluster_and_song = pd.merge(both, data_by_cluster, on="cluster")
    data_by_cluster_and_song["segment_proba"] = data_by_cluster_and_song["nb_streams"] / data_by_cluster_and_song["nb_streams_by_cluster"]
    if not os.path.exists("{}/{}/".format(master_path, clustering_path + "_probas_" + embeddings_version)):
        os.mkdir("{}/{}/".format(master_path, clustering_path + "_probas_" + embeddings_version))
    for cluster_id in range(nb_clusters):
        if cluster_id % 10 == 0:
            print("probas by cluster and by song computed for cluster: " + str(cluster_id))
        list_proba = []
        for song_index in range(nb_songs):
            list_proba.append(data_by_cluster_and_song.iloc[cluster_id * nb_songs + song_index]["segment_proba"])
        with open("{}/{}/list_proba_{}.pkl".format(master_path, clustering_path + "_probas_" + embeddings_version, cluster_id), "wb") as f:
            pickle.dump(list_proba, f)
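
# Example call (again a sketch with placeholder values, not from the repository;
# `nb_songs` must match the song-index range used to build the d1d30_songs lists):
#
#   train_inputfeatureskmeans(
#       dataset_path="data",
#       master_path="models",
#       clustering_path="kmeans_features",
#       nb_clusters=1000,
#       max_iter=300,
#       random_state=42,
#       nb_songs=config["nb_songs"],
#       embeddings_version="svd",
#       clusters_filename="kmeans_features.pkl",
#   )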