Skip to content

Commit

Permalink
merging in test-workflows and fixing conflicts, with updated github action config
Browse files Browse the repository at this point in the history
  • Loading branch information
Sam Learner authored and Sam Learner committed Dec 27, 2024
2 parents faab13f + bfbd960 commit de77765
Show file tree
Hide file tree
Showing 219 changed files with 1,272,362 additions and 1,567,161 deletions.
6 changes: 3 additions & 3 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ scikit-learn = ">=0.23.2"
requests = ">=2.25.0"
beautifulsoup4 = ">=4.9.3"
asyncio = ">=3.4.3"
aiohttp = ">=3.7.3"
aiohttp = "3.7.3"
pymongo = ">=3.12.0"
dnspython = ">=2.0.0"
motor = ">=2.3.0"
lxml = ">=4.6.2"
scikit-surprise = ">=1.1.1"
numpy = ">=1.19.4"
scikit-surprise = "1.1.4"
numpy = "1.26.0"
pandas = ">=1.1.4"
waitress = ">=2.0.0"
tqdm = ">=4.50.2"
Expand Down
339 changes: 89 additions & 250 deletions Pipfile.lock

Large diffs are not rendered by default.

72 changes: 45 additions & 27 deletions data_processing/create_training_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,16 @@

from db_connect import connect_to_db


def get_sample(cursor, iteration_size):
    """Draw a random sample of documents from a MongoDB collection.

    Retries indefinitely on ``$sample`` operation failures, which MongoDB can
    raise transiently (e.g. when the sampled set changes mid-operation).

    Args:
        cursor: A MongoDB collection (or any object exposing ``aggregate``).
        iteration_size: Number of documents to request per ``$sample`` stage.

    Returns:
        list: The sampled documents.
    """
    while True:
        try:
            rating_sample = cursor.aggregate([{"$sample": {"size": iteration_size}}])
            return list(rating_sample)
        except pymongo.errors.OperationFailure:
            # $sample can fail transiently; just retry until it succeeds.
            print("Encountered $sample operation error. Retrying...")


def create_training_data(db_client, sample_size=200000):
ratings = db_client.ratings

Expand All @@ -31,7 +30,7 @@ def create_training_data(db_client, sample_size=200000):
while unique_records < sample_size:
rating_sample = get_sample(ratings, 100000)
all_ratings += rating_sample
unique_records = len(set([(x['movie_id'] + x['user_id']) for x in all_ratings]))
unique_records = len(set([(x["movie_id"] + x["user_id"]) for x in all_ratings]))
print(unique_records)

df = pd.DataFrame(all_ratings)
Expand All @@ -46,15 +45,28 @@ def create_training_data(db_client, sample_size=200000):

def create_movie_data_sample(db_client, movie_list):
    """Build a movie-metadata DataFrame for the given movie ids.

    Args:
        db_client: Database handle exposing a ``movies`` collection with
            ``find`` (e.g. a pymongo database object).
        movie_list: Iterable of ``movie_id`` values to include.

    Returns:
        pandas.DataFrame: Columns ``movie_id``, ``image_url``,
        ``movie_title``, ``year_released``, with known CDN prefixes and the
        empty-poster placeholder URL stripped from ``image_url``.
    """
    movies = db_client.movies
    included_movies = movies.find({"movie_id": {"$in": movie_list}})

    movie_df = pd.DataFrame(list(included_movies))
    movie_df = movie_df[["movie_id", "image_url", "movie_title", "year_released"]]
    # Normalize image URLs: drop the resized-CDN prefix so only the relative
    # path is stored, and blank out the shared "empty poster" placeholder.
    # fillna("") first so str.replace never sees NaN.
    movie_df["image_url"] = (
        movie_df["image_url"]
        .fillna("")
        .str.replace("https://a.ltrbxd.com/resized/", "", regex=False)
        .str.replace(
            "https://s.ltrbxd.com/static/img/empty-poster-230.c6baa486.png",
            "",
            regex=False,
        )
    )

    return movie_df


if __name__ == "__main__":
# Connect to MongoDB client
db_name, client, tmdb_key = connect_to_db()
Expand All @@ -63,33 +75,39 @@ def create_movie_data_sample(db_client, movie_list):
min_review_threshold = 20

# Generate training data sample
training_df = create_training_data(db, 1500000)
training_df = create_training_data(db, 1200000)

# Create review counts dataframe
review_count = db.ratings.aggregate([
{ "$group": { "_id": "$movie_id", "review_count": { "$sum": 1 } } },
{ "$match": { "review_count": { "$gte": min_review_threshold } } }
])
review_count = db.ratings.aggregate(
[
{"$group": {"_id": "$movie_id", "review_count": {"$sum": 1}}},
{"$match": {"review_count": {"$gte": min_review_threshold}}},
]
)
review_counts_df = pd.DataFrame(list(review_count))
review_counts_df.rename(columns={"_id": "movie_id", "review_count": "count"}, inplace=True)
review_counts_df.rename(
columns={"_id": "movie_id", "review_count": "count"}, inplace=True
)

threshold_movie_list = review_counts_df["movie_id"].to_list()

threshold_movie_list = review_counts_df['movie_id'].to_list()

# Generate movie data CSV
movie_df = create_movie_data_sample(db, threshold_movie_list)
print(movie_df.head())
print(movie_df.shape)

# Use movie_df to remove any items from threshold_list that do not have a "year_released"
# This virtually always means it's a collection of more popular movies (such as the LOTR trilogy) and we don't want it included in recs
retain_list = movie_df.loc[(movie_df['year_released'].notna() & movie_df['year_released'] != 0.0)]['movie_id'].to_list()

retain_list = movie_df.loc[
(movie_df["year_released"].notna() & movie_df["year_released"] != 0.0)
]["movie_id"].to_list()

threshold_movie_list = [x for x in threshold_movie_list if x in retain_list]

# Store Data
with open('models/threshold_movie_list.txt', 'wb') as fp:
with open("models/threshold_movie_list.txt", "wb") as fp:
pickle.dump(threshold_movie_list, fp)
training_df.to_csv('data/training_data.csv', index=False)
review_counts_df.to_csv('data/review_counts.csv', index=False)
movie_df.to_csv('../static/data/movie_data.csv', index=False)

training_df.to_csv("data/training_data.csv", index=False)
review_counts_df.to_csv("data/review_counts.csv", index=False)
movie_df.to_csv("../static/data/movie_data.csv", index=False)
Loading

0 comments on commit de77765

Please sign in to comment.