Skip to content

Commit

Permalink
V.0.2.1 (#15)
Browse files Browse the repository at this point in the history
* feat: update gitignore, add IDE

* feat: add dockerfile to streamlit app

* feat: create scripts

* feat: move files

* feat: resolve error in dockerfile

* feat: update function recommends and add new components

* feat: add checkbox in service

* feat: remove description

* feat: split page in columns

* feat: remove variable

* feat: remove variable

* feat: add improves in pipeline

---------

Co-authored-by: eddi <[email protected]>
  • Loading branch information
sc0v0ne and eddi authored Nov 18, 2023
1 parent a0c69f0 commit 777791e
Show file tree
Hide file tree
Showing 19 changed files with 69,176 additions and 46,068 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,6 @@ dmypy.json
# Cython debug symbols
cython_debug/

# IDE
.idea
.vscode
3 changes: 2 additions & 1 deletion containers/Dockerfile.preprocess
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ RUN pip install --upgrade pip & \
pip install \
numpy==1.24.0 \
pandas==2.0.3 \
scikit-learn==1.3.0
scikit-learn==1.3.0 \
joblib==1.3.2

COPY pipe /preprocess
17 changes: 17 additions & 0 deletions containers/Dockerfile.streamlit
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
FROM python:3.10

# Copy only the requirements first so Docker layer caching keeps the
# installed dependencies across source-code changes.
COPY /requirements.txt /

# NOTE: the original used a single "&", which backgrounds the pip upgrade
# and hides its failure; "&&" chains the commands and fails the build if
# either step fails.
RUN pip install --upgrade pip && \
    pip install -r requirements.txt

WORKDIR /streamlit

COPY /src /streamlit/src

EXPOSE 7999

ENTRYPOINT ["streamlit", "run", "/streamlit/src/main.py", "--server.port=7999"]
27 changes: 19 additions & 8 deletions pipe/clusters.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,37 @@
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans


def pipeline_clusters(path_input, data_train, all_data):
def init_train(path_input, data_train, all_data):
print('-' * 100)
print('Final')
path_train = os.path.join(path_input, 'processed', data_train)
print('Initialize Train')
path_train = os.path.join(path_input, 'data', 'processed', data_train)
X_train = np.array(pd.read_csv(path_train))

kmeans_model = KMeans(n_clusters=35, random_state=0)
kmeans_model = KMeans(n_clusters=277, random_state=123456)
y_clusters = kmeans_model.fit_predict(X_train)

path_dataset = os.path.join(path_input, 'processed' ,all_data)

print('Prepare new Dataframe')
path_dataset = os.path.join(path_input, 'data', 'processed', all_data)
dataset = pd.read_csv(path_dataset)
dataset['clusters_genre_type'] = y_clusters
OUTPUT= os.path.join(path_input, 'final')

print('Save outputs')
OUTPUT= os.path.join(path_input, 'data', 'final')

if not os.path.exists(OUTPUT):
os.mkdir(OUTPUT)
dataset.to_csv('data/final/dataset_titles_final.csv', index=False)
print('Sucefully data final')


OUTPUT_MODEL = os.path.join(path_input, 'data', 'models')

if not os.path.exists(OUTPUT_MODEL):
os.mkdir(OUTPUT_MODEL)

model_path = os.path.join(OUTPUT_MODEL, 'model_kmeans_20231118.pkl')
joblib.dump(kmeans_model, model_path)
8 changes: 4 additions & 4 deletions pipe/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from preprocess import preprocess
from clusters import pipeline_clusters
from clusters import init_train
import sys
import os
if __name__ == '__main__':
Expand All @@ -8,8 +8,8 @@

preprocess(NAME_INPUT_DIR)

PATH_PROCESSED = os.path.join('/preprocess/data')
DATA_TRAIN = 'train_genger.csv'
PATH_PROCESSED = os.path.join('/preprocess')
DATA_TRAIN = 'train_gender.csv'
DATA_MOVIES_SERIES = 'data_titles_processed.csv'

pipeline_clusters(PATH_PROCESSED, DATA_TRAIN, DATA_MOVIES_SERIES)
init_train(PATH_PROCESSED, DATA_TRAIN, DATA_MOVIES_SERIES)
56 changes: 34 additions & 22 deletions pipe/preprocess.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
import os
import sys

import pandas as pd


def preprocess(path_input):
PATTERN_PATH = os.path.join('/preprocess', 'data')
datasets_names = os.listdir(os.path.join(PATTERN_PATH, path_input))
print('-' * 100)
print('\nLog preprocess execution\n')
print(datasets_names)
PATTERN_PATH = os.path.join('/preprocess', 'data', path_input)
datasets_names = os.listdir(PATTERN_PATH)
print(datasets_names, flush=True)

all_data = []
for dir in datasets_names:
read_pd = pd.read_csv(os.path.join(PATTERN_PATH, path_input, dir))
read_pd['channel_streaming'] = dir.split('_')[0]
for dir_ in datasets_names:
path_file_csv = os.path.join(PATTERN_PATH, dir_)
read_pd = pd.read_csv(path_file_csv)
read_pd['channel_streaming'] = dir_.split('_')[0]
all_data.append(read_pd)

try:
Expand Down Expand Up @@ -48,27 +45,42 @@ def preprocess(path_input):

data_titles = pd.concat(all_data, axis=0)

df_split = data_titles['gender_type'].str.split(',', expand=True)
df_split = df_split.fillna('-')
data_titles['gender_type'] = data_titles['gender_type'].str.lower()

group_dummies = [
pd.get_dummies(df_split[y].apply(lambda x: x.strip()), dtype='int')
for y in df_split.columns
]

df_split = data_titles['gender_type'].str.split(',', expand=True)

df_split = df_split.fillna('-')
path_input
for x in df_split.columns:
df_split[x] = df_split[x].apply(lambda i: i.strip())

group_dummies = [df_split[d] for d in df_split.columns]

for x in group_dummies:
print(type(x))

group_dummies = [pd.get_dummies(d, dtype='int') for d in group_dummies]

print(len(group_dummies))

group_dummies = pd.concat(group_dummies, axis=1)

group_dummies = group_dummies.fillna(0).astype('uint8')

data_titles['title'] = data_titles['title'].apply(lambda x: x.upper())

OUTPUT= os.path.join(PATTERN_PATH, 'processed')
group_dummies.drop(columns=['-'], axis=1, inplace=True)

data_titles['title'] = data_titles['title'].apply(lambda x: x.lower())


OUTPUT= os.path.join('/preprocess', 'data', 'processed')
if not os.path.exists(OUTPUT):
os.mkdir(OUTPUT)

data_titles.to_csv('/preprocess/data/processed/data_titles_processed.csv', index=False)
print('Sucefully Data Titles')
group_dummies.to_csv('/preprocess/data/processed/train_genger.csv',
group_dummies.to_csv('/preprocess/data/processed/train_gender.csv',
index=False)
print('Sucefully train genger')
print('-' * 100)
print('Sucefully group_dummies')
print('-' * 100)

2 changes: 2 additions & 0 deletions scripts/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Build the Streamlit app image from the repository root.
# --rm removes intermediate containers after a successful build.
docker build . -f ./containers/Dockerfile.streamlit -t streamlit_app:latest --rm
2 changes: 2 additions & 0 deletions scripts/preprocess.sh → scripts/init_pipe.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

NAME_INPUT_DIR=raw

docker build . -f containers/Dockerfile.preprocess -t container_preprocess
docker run -it \
-v ${PWD}/src/data:/preprocess/data container_preprocess \
Expand Down
3 changes: 3 additions & 0 deletions scripts/run_local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Build the image, then run it locally with the source tree bind-mounted so
# code edits are picked up without rebuilding; the UI is served on port 7999.
bash ./scripts/build.sh
docker run -v ${PWD}/src:/streamlit/src -p 7999:7999 streamlit_app:latest
File renamed without changes.
Empty file added src/__init__.py
Empty file.
Empty file added src/components/__init__.py
Empty file.
34 changes: 29 additions & 5 deletions src/components/query.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
def query_name_title(dataset, nome, top_n):
new_search = nome.upper()
movie = dataset[dataset['title'] == new_search][['clusters_genre_type']]
import pandas as pd
from components.responses import response_markdown, response_recommends

def recommends(
dataset:pd.DataFrame,
name: str,
top_n: int,
extra_cols: dict,
) -> pd.DataFrame:

rename = name.lower()
exists_title = len(dataset[dataset['title'].str.contains(rename)])

if exists_title == 0:
return response_markdown('This title not exists')

try:
top_n = int(top_n)
except Exception as e:
print('Exception', e)
return response_markdown('This is not number')

extra_cols = [x for x, y in extra_cols.items() if y]

movie = dataset[dataset['title'] == rename][['clusters_genre_type']]
reset_movie = movie.reset_index()
reset_movie = reset_movie.at[0, 'clusters_genre_type']
k_id = int(reset_movie)
result = dataset[dataset['clusters_genre_type'] == k_id][['title', 'gender_type']][:int(top_n)]
cols_view = ['title', 'gender_type'] + extra_cols
result = dataset[dataset['clusters_genre_type'] == k_id][cols_view][:int(top_n)]
result.set_index('title')
return result

return response_recommends(result)
12 changes: 12 additions & 0 deletions src/components/responses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd
import streamlit as st


def response_recommends(results: pd.DataFrame) -> None:
    """Render the recommendation results as a Streamlit dataframe widget.

    Note: the original annotated the return type as ``st.dataframe`` (a
    function object, not a type); this function renders a widget and
    returns ``None``.
    """
    st.dataframe(
        results,
        use_container_width=False,
    )


def response_markdown(text: str) -> None:
    """Render *text* as Streamlit markdown (used for user-facing messages)."""
    st.markdown(text)
Loading

0 comments on commit 777791e

Please sign in to comment.