Skip to content

Commit

Permalink
merging in test-workflows and fixing conflicts, with updated github action config
Browse files Browse the repository at this point in the history
  • Loading branch information
Sam Learner authored and Sam Learner committed Dec 27, 2024
2 parents faab13f + bfbd960 commit de77765
Show file tree
Hide file tree
Showing 219 changed files with 1,272,362 additions and 1,567,161 deletions.
6 changes: 3 additions & 3 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ scikit-learn = ">=0.23.2"
requests = ">=2.25.0"
beautifulsoup4 = ">=4.9.3"
asyncio = ">=3.4.3"
aiohttp = ">=3.7.3"
aiohttp = "3.7.3"
pymongo = ">=3.12.0"
dnspython = ">=2.0.0"
motor = ">=2.3.0"
lxml = ">=4.6.2"
scikit-surprise = ">=1.1.1"
numpy = ">=1.19.4"
scikit-surprise = "1.1.4"
numpy = "1.26.0"
pandas = ">=1.1.4"
waitress = ">=2.0.0"
tqdm = ">=4.50.2"
Expand Down
339 changes: 89 additions & 250 deletions Pipfile.lock

Large diffs are not rendered by default.

72 changes: 45 additions & 27 deletions data_processing/create_training_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,16 @@

from db_connect import connect_to_db


def get_sample(cursor, iteration_size):
    """Draw a random sample of documents from a MongoDB collection.

    Retries indefinitely on ``$sample`` operation failures, which MongoDB can
    raise transiently (e.g. when the sampled set changes mid-operation).

    Args:
        cursor: A MongoDB collection (or any object exposing ``aggregate``).
        iteration_size: Number of documents to request per ``$sample`` stage.

    Returns:
        list: The sampled documents.
    """
    while True:
        try:
            rating_sample = cursor.aggregate([{"$sample": {"size": iteration_size}}])
            return list(rating_sample)
        except pymongo.errors.OperationFailure:
            # $sample can fail transiently; just retry until it succeeds.
            print("Encountered $sample operation error. Retrying...")


def create_training_data(db_client, sample_size=200000):
ratings = db_client.ratings

Expand All @@ -31,7 +30,7 @@ def create_training_data(db_client, sample_size=200000):
while unique_records < sample_size:
rating_sample = get_sample(ratings, 100000)
all_ratings += rating_sample
unique_records = len(set([(x['movie_id'] + x['user_id']) for x in all_ratings]))
unique_records = len(set([(x["movie_id"] + x["user_id"]) for x in all_ratings]))
print(unique_records)

df = pd.DataFrame(all_ratings)
Expand All @@ -46,15 +45,28 @@ def create_training_data(db_client, sample_size=200000):

def create_movie_data_sample(db_client, movie_list):
    """Build a movie-metadata DataFrame for the given movie ids.

    Args:
        db_client: Database handle exposing a ``movies`` collection with
            ``find`` (e.g. a pymongo database object).
        movie_list: Iterable of ``movie_id`` values to include.

    Returns:
        pandas.DataFrame: Columns ``movie_id``, ``image_url``,
        ``movie_title``, ``year_released``, with known CDN prefixes and the
        empty-poster placeholder URL stripped from ``image_url``.
    """
    movies = db_client.movies
    included_movies = movies.find({"movie_id": {"$in": movie_list}})

    movie_df = pd.DataFrame(list(included_movies))
    movie_df = movie_df[["movie_id", "image_url", "movie_title", "year_released"]]
    # Normalize image URLs: drop the resized-CDN prefix so only the relative
    # path is stored, and blank out the shared "empty poster" placeholder.
    # fillna("") first so str.replace never sees NaN.
    movie_df["image_url"] = (
        movie_df["image_url"]
        .fillna("")
        .str.replace("https://a.ltrbxd.com/resized/", "", regex=False)
        .str.replace(
            "https://s.ltrbxd.com/static/img/empty-poster-230.c6baa486.png",
            "",
            regex=False,
        )
    )

    return movie_df


if __name__ == "__main__":
# Connect to MongoDB client
db_name, client, tmdb_key = connect_to_db()
Expand All @@ -63,33 +75,39 @@ def create_movie_data_sample(db_client, movie_list):
min_review_threshold = 20

# Generate training data sample
training_df = create_training_data(db, 1500000)
training_df = create_training_data(db, 1200000)

# Create review counts dataframe
review_count = db.ratings.aggregate([
{ "$group": { "_id": "$movie_id", "review_count": { "$sum": 1 } } },
{ "$match": { "review_count": { "$gte": min_review_threshold } } }
])
review_count = db.ratings.aggregate(
[
{"$group": {"_id": "$movie_id", "review_count": {"$sum": 1}}},
{"$match": {"review_count": {"$gte": min_review_threshold}}},
]
)
review_counts_df = pd.DataFrame(list(review_count))
review_counts_df.rename(columns={"_id": "movie_id", "review_count": "count"}, inplace=True)
review_counts_df.rename(
columns={"_id": "movie_id", "review_count": "count"}, inplace=True
)

threshold_movie_list = review_counts_df["movie_id"].to_list()

threshold_movie_list = review_counts_df['movie_id'].to_list()

# Generate movie data CSV
movie_df = create_movie_data_sample(db, threshold_movie_list)
print(movie_df.head())
print(movie_df.shape)

# Use movie_df to remove any items from threshold_list that do not have a "year_released"
# This virtually always means it's a collection of more popular movies (such as the LOTR trilogy) and we don't want it included in recs
retain_list = movie_df.loc[(movie_df['year_released'].notna() & movie_df['year_released'] != 0.0)]['movie_id'].to_list()

retain_list = movie_df.loc[
(movie_df["year_released"].notna() & movie_df["year_released"] != 0.0)
]["movie_id"].to_list()

threshold_movie_list = [x for x in threshold_movie_list if x in retain_list]

# Store Data
with open('models/threshold_movie_list.txt', 'wb') as fp:
with open("models/threshold_movie_list.txt", "wb") as fp:
pickle.dump(threshold_movie_list, fp)
training_df.to_csv('data/training_data.csv', index=False)
review_counts_df.to_csv('data/review_counts.csv', index=False)
movie_df.to_csv('../static/data/movie_data.csv', index=False)

training_df.to_csv("data/training_data.csv", index=False)
review_counts_df.to_csv("data/review_counts.csv", index=False)
movie_df.to_csv("../static/data/movie_data.csv", index=False)
Loading

0 comments on commit de77765

Please sign in to comment.