chore: mlops example
redanrd committed Nov 28, 2023
1 parent 26942d6 commit 801d12f
Showing 27 changed files with 624 additions and 28 deletions.
37 changes: 37 additions & 0 deletions jobs/ml-ops/README.md
@@ -0,0 +1,37 @@
# Serverless MLOps

In this example, we train and deploy a binary classification inference API using serverless compute resources (a job and a container). We use Object Storage buckets to store data and training artifacts, and a Container Registry namespace to store Docker images.

## Use case: Bank Telemarketing

### Context

We use a bank telemarketing dataset to predict whether a client will subscribe to a term deposit. The dataset records marketing phone calls made to clients. The outcome of each call is shown in the `y` column:
* `0` : no subscription
* `1` : subscription
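
Once the file is staged (see Data Source below), you can sanity-check the class balance. A minimal sketch, assuming pandas is installed and that the raw file uses `;` as separator:

```python
import pandas as pd

# The target is heavily imbalanced (few subscriptions), which is why the
# training job over-samples the minority class before fitting (see ./job).
data = pd.read_csv("./s3/data-store/data/bank_telemarketing.csv", sep=";")
print(data["y"].value_counts(normalize=True))
```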

### Data Source

The dataset is open-sourced and published [here](http://archive.ics.uci.edu/dataset/222/bank+marketing) on the UCI Machine Learning repository. It exists in several versions; the one we use is close to that analyzed in the following research work:

* [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014

We use the file labelled `bank-additional-full.csv` in the source. Download and extract this file, rename it to `bank_telemarketing.csv`, then put it under this [directory](./s3/data-store/data/), as sketched below.
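
A minimal staging sketch in Python; the download URL and archive layout are assumptions based on the current UCI page and may change:

```python
import io
import urllib.request
import zipfile

# Assumed UCI download URL; the archive nests bank-additional.zip
# inside the top-level zip file.
URL = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"

outer = zipfile.ZipFile(io.BytesIO(urllib.request.urlopen(URL).read()))
inner = zipfile.ZipFile(io.BytesIO(outer.read("bank-additional.zip")))
with open("./s3/data-store/data/bank_telemarketing.csv", "wb") as f:
    f.write(inner.read("bank-additional/bank-additional-full.csv"))
```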

## How to deploy your MLOps workflow in the cloud

### Step A: Create object storage resources and upload your data file to data store

See [this README](./s3/README.md).

### Step B: Create registry namespaces for your docker images

See [this README](./registry/README.md).

### Step C: Run a machine learning job

See [this README](./job/README.md).

### Step D: Deploy an inference API as a serverless container

See [this README](./container/README.md).
37 changes: 37 additions & 0 deletions jobs/ml-ops/container/README.md
@@ -0,0 +1,37 @@
# Deploy an inference API as a container

## Step 1: Build and push API image to Scaleway's Registry

```bash
docker build -t rg.fr-par.scw.cloud/inference-api-images/inference-api:v1 .
docker login rg.fr-par.scw.cloud/inference-api-images -u nologin --password-stdin <<< "$SCW_SECRET_KEY"
docker push rg.fr-par.scw.cloud/inference-api-images/inference-api:v1
```

## Step 2: Create and deploy a private inference container

Create a `testing.tfvars` file in the `terraform` directory and set the variable values in it:

```hcl
region         = "fr-par"
access_key     = "<access_key>"
secret_key     = "<secret_key>"
project_id     = "<project_id>"
registry_image = "rg.fr-par.scw.cloud/inference-api-images/inference-api:v1"
```

Then perform:

```bash
cd terraform
terraform plan -var-file=testing.tfvars
terraform apply -var-file=testing.tfvars
```

## Step 3: Test the inference API

The container endpoint is exported as the `inference_api_endpoint` Terraform output. Since the container is deployed as private, generate an authentication token for it and pass it in the `X-Auth-Token` header:

```bash
curl -H "X-Auth-Token: $CONTAINER_TOKEN" -X POST "<scw_container_endpoint>/inference" -H "Content-Type: application/json" -d '{"age": 44, "job": "blue-collar", "marital": "married", "education": "basic.4y", "default": "unknown", "housing": "yes", "loan": "no", "contact": "cellular", "month": "aug", "day_of_week": "thu", "duration": 210, "campaign": 1, "pdays": 999, "previous": 0, "poutcome": "nonexistent", "emp_var_rate": 1.4, "cons_price_idx": 93.444, "cons_conf_idx": -36.1, "euribor3m": 4.963, "nr_employed": 5228.1}'
```
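
If you prefer testing from Python, here is an equivalent minimal sketch using `requests`. `CONTAINER_ENDPOINT` and `CONTAINER_TOKEN` are assumed environment variables holding the Terraform output domain and a container token; neither name is part of this example:

```python
import os

import requests

profile = {
    "age": 44, "job": "blue-collar", "marital": "married",
    "education": "basic.4y", "default": "unknown", "housing": "yes",
    "loan": "no", "contact": "cellular", "month": "aug",
    "day_of_week": "thu", "duration": 210, "campaign": 1, "pdays": 999,
    "previous": 0, "poutcome": "nonexistent", "emp_var_rate": 1.4,
    "cons_price_idx": 93.444, "cons_conf_idx": -36.1, "euribor3m": 4.963,
    "nr_employed": 5228.1,
}

# POST to the /inference route defined by the FastAPI app.
response = requests.post(
    f"https://{os.environ['CONTAINER_ENDPOINT']}/inference",
    headers={"X-Auth-Token": os.environ["CONTAINER_TOKEN"]},
    json=profile,
    timeout=30,
)
print(response.json())  # e.g. {"predicted_class": 0}
```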
10 changes: 10 additions & 0 deletions jobs/ml-ops/container/inference-api/Dockerfile
@@ -0,0 +1,10 @@
FROM python:3.10.12

WORKDIR /tmp/ml-serving/

COPY . .

RUN pip install --upgrade pip
RUN pip install -r requirements.txt

CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80"]
89 changes: 89 additions & 0 deletions jobs/ml-ops/container/inference-api/data_processing.py
@@ -0,0 +1,89 @@
import pandas as pd
import numpy as np
from pydantic import BaseModel

class ClientProfile(BaseModel):
    """
    This class represents the features of a client profile to which inference is applied
    """

    age: int
    job: str
    marital: str
    education: str
    default: str
    housing: str
    loan: str
    contact: str
    month: str
    day_of_week: str
    duration: int
    campaign: int
    pdays: int
    previous: int
    poutcome: str
    emp_var_rate: float
    cons_price_idx: float
    cons_conf_idx: float
    euribor3m: float
    nr_employed: float

def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    This function removes rows with missing value(s)
    """

    data = data.dropna()
    return data

def transform_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    This function transforms the categorical variables of the dataset into 0/1 indicators.
    It also inserts missing indicator columns, which default to False (0).
    """

    # use the same category for all basic education sub-categories
    data['education'] = np.where(data['education'] == 'basic.9y', 'Basic', data['education'])
    data['education'] = np.where(data['education'] == 'basic.6y', 'Basic', data['education'])
    data['education'] = np.where(data['education'] == 'basic.4y', 'Basic', data['education'])

    # transform all categorical variables into 0/1 indicators and drop the original string columns
    cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
    for var in cat_vars:
        dummies = pd.get_dummies(data[var], prefix=var)
        data = data.join(dummies)

    data_vars = data.columns.values.tolist()
    to_keep = [i for i in data_vars if i not in cat_vars]
    data = data[to_keep]

    # normalize column naming
    data.columns = data.columns.str.replace('.', '_')
    data.columns = data.columns.str.replace(' ', '_')

    # insert indicator columns that are missing from this sample; the model
    # expects the full training-time feature set
    cat_vars_target = [
        'age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
        'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed',
        'job_admin_', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
        'job_management', 'job_retired', 'job_self-employed', 'job_services',
        'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
        'marital_divorced', 'marital_married', 'marital_single',
        'marital_unknown', 'education_Basic', 'education_high_school',
        'education_illiterate', 'education_professional_course',
        'education_university_degree', 'education_unknown', 'default_no',
        'default_unknown', 'default_yes', 'housing_no', 'housing_unknown',
        'housing_yes', 'loan_no', 'loan_unknown', 'loan_yes',
        'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',
        'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
        'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',
        'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
        'day_of_week_wed', 'poutcome_failure', 'poutcome_nonexistent',
        'poutcome_success'
    ]

    for column_index, column_name in enumerate(cat_vars_target):
        if column_name not in data.columns:
            data.insert(column_index, column_name, False)

    return data
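
Note why the back-fill loop above matters: a single client profile only generates dummy columns for its own categories, so the remaining indicator columns must be inserted for the feature set to match the training schema. A small illustration (hypothetical value, not part of the module):

```python
import pandas as pd

row = pd.DataFrame(index=[0], data={"job": "student"})
print(pd.get_dummies(row["job"], prefix="job").columns.tolist())
# ['job_student'] -> all other job_* indicators must be inserted as False
```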
35 changes: 35 additions & 0 deletions jobs/ml-ops/container/inference-api/main.py
@@ -0,0 +1,35 @@
from fastapi import FastAPI
from dotenv import load_dotenv
import data_processing as process
import pickle, boto3, pandas, os

app = FastAPI()

load_dotenv(dotenv_path='./.env')

s3 = boto3.resource(
    's3',
    region_name=os.getenv('SCW_REGION'),
    use_ssl=True,
    endpoint_url='https://s3.{region}.scw.cloud'.format(region=os.getenv('SCW_REGION')),
    aws_access_key_id=os.getenv('SCW_ACCESS_KEY'),
    aws_secret_access_key=os.getenv('SCW_SECRET_KEY'),
)

# download the trained model from the model registry bucket at startup
bucket = s3.Bucket(name=os.getenv('SCW_MODEL_REGISTRY'))  # type: ignore
bucket.download_file(os.getenv('MODEL_FILE_NAME'), os.getenv('MODEL_FILE_NAME'))

classifier = pickle.load(open(os.getenv('MODEL_FILE_NAME', ''), 'rb'))

@app.post('/inference')
def classify(data: process.ClientProfile):
    """
    This function predicts the subscription class for a given client profile
    """

    data_point_json = data.model_dump()
    data_point_pd = pandas.DataFrame(index=[0], data=data_point_json)
    data_point_processed = process.transform_data(process.clean_data(data_point_pd))
    prediction = classifier.predict(data_point_processed)

    return {'predicted_class': int(prediction[0])}
7 changes: 7 additions & 0 deletions jobs/ml-ops/container/inference-api/requirements.txt
@@ -0,0 +1,7 @@
fastapi==0.104.1
boto3==1.29.3
uvicorn==0.24.0.post1
pandas==2.0.2
numpy==1.24.3
scikit-learn==1.2.2
python-dotenv==1.0.0
19 changes: 19 additions & 0 deletions jobs/ml-ops/container/terraform/main.tf
@@ -0,0 +1,19 @@
resource "scaleway_container_namespace" "inference_api_namespace" {
name = "ml-serving"
description = "Serving inference models deployed as serverless containers"
}

resource "scaleway_container" "inference_api_container" {
name = "inference-api"
description = "Serving an inference API"
namespace_id = scaleway_container_namespace.inference_api_namespace.id
registry_image = var.registry_image
port = 80
cpu_limit = 1120
memory_limit = 2048
min_scale = 1
max_scale = 5
privacy = "private"
protocol = "http1"
deploy = true
}
3 changes: 3 additions & 0 deletions jobs/ml-ops/container/terraform/outputs.tf
@@ -0,0 +1,3 @@
output "inference_api_endpoint" {
value = scaleway_container.inference_api_container.domain_name
}
6 changes: 6 additions & 0 deletions jobs/ml-ops/container/terraform/providers.tf
@@ -0,0 +1,6 @@
provider "scaleway" {
region = var.region
access_key = var.access_key
secret_key = var.secret_key
project_id = var.project_id
}
19 changes: 19 additions & 0 deletions jobs/ml-ops/container/terraform/variables.tf
@@ -0,0 +1,19 @@
variable "region" {
type = string
}

variable "access_key" {
type = string
}

variable "secret_key" {
type = string
}

variable "project_id" {
type = string
}

variable "registry_image" {
type = string
}
8 changes: 8 additions & 0 deletions jobs/ml-ops/container/terraform/versions.tf
@@ -0,0 +1,8 @@
terraform {
  required_providers {
    scaleway = {
      source = "scaleway/scaleway"
    }
  }
  required_version = ">= 0.13"
}
10 changes: 10 additions & 0 deletions jobs/ml-ops/job/Dockerfile
@@ -0,0 +1,10 @@
FROM python:3.10.12

WORKDIR /ml-job/

COPY . .

RUN pip install --upgrade pip
RUN pip install -r requirements.txt

CMD [ "python", "main.py" ]
28 changes: 28 additions & 0 deletions jobs/ml-ops/job/README.md
@@ -0,0 +1,28 @@
# Machine Learning job for binary classification use case

## Step 1: Build and push ML training image to Scaleway's Registry

Create and fill a `.env` file as follows:

```bash
SCW_ACCESS_KEY=<scw_access_key>
SCW_SECRET_KEY=<scw_secret_key>
SCW_DATA_STORE=data-store
DATA_FILE_NAME=bank_telemarketing.csv
SCW_MODEL_REGISTRY=model-registry
MODEL_FILE_NAME=classifier.pkl
SCW_PERF_MONITOR=performance-monitoring
SCW_REGION=fr-par
```

The variable names must match those that `main.py` reads via `os.getenv`.

Then build and push job image to registry:

```bash
docker build -t rg.fr-par.scw.cloud/ml-job-images/ml-job:v1 .
docker login rg.fr-par.scw.cloud/ml-job-images -u nologin --password-stdin <<< "$SCW_SECRET_KEY"
docker push rg.fr-par.scw.cloud/ml-job-images/ml-job:v1
```

## Step 2: Define and run an ML job

You can create a job definition in the Scaleway console using the private registry image pushed above. Note that the `.env` file is copied into the image by the Dockerfile's `COPY . .`, so the job can read its configuration from there. Run the job, then check that the training artifacts have been uploaded to the Object Storage buckets.
57 changes: 57 additions & 0 deletions jobs/ml-ops/job/main.py
@@ -0,0 +1,57 @@
import sys, os, pickle, boto3
import ml_training as ml
from dotenv import load_dotenv

def main() -> int:
    """
    This function trains a classifier on data pulled from a data store.
    It uploads training/test artifacts into object storage.
    """

    load_dotenv(dotenv_path='./.env')

    s3 = boto3.resource(
        's3',
        region_name=os.getenv('SCW_REGION'),
        use_ssl=True,
        endpoint_url='https://s3.{region}.scw.cloud'.format(region=os.getenv('SCW_REGION')),
        aws_access_key_id=os.getenv('SCW_ACCESS_KEY'),
        aws_secret_access_key=os.getenv('SCW_SECRET_KEY'),
    )

    # download data locally from data store
    os.makedirs('./data', exist_ok=True)  # ensure the local target directory exists
    data_store = s3.Bucket(name=os.getenv('SCW_DATA_STORE'))  # type: ignore
    data_store.download_file(os.getenv('DATA_FILE_NAME'), './data/' + os.getenv('DATA_FILE_NAME', ''))
    data = ml.load_data('./data/' + os.getenv('DATA_FILE_NAME', ''))
    cleaned_data = ml.clean_data(data)
    transformed_data = ml.transform_data(cleaned_data)

    X_train, X_test, y_train, y_test = ml.split_to_train_test_data(transformed_data)
    X_train, y_train = ml.over_sample_target_class(X_train, y_train)

    # train and upload classifier to model registry
    classifier, _ = ml.tune_classifier(X_train, y_train)
    pickle.dump(classifier, open(os.getenv('MODEL_FILE_NAME', ''), 'wb'))
    model_registry = s3.Bucket(name=os.getenv('SCW_MODEL_REGISTRY'))  # type: ignore
    model_registry.upload_file(Filename='/ml-job/' + os.getenv('MODEL_FILE_NAME', ''), Key=os.getenv('MODEL_FILE_NAME'))

    # compute performance on test data
    y_pred = ml.predict_on_test_data(classifier, X_test)
    y_pred_prob = ml.predict_prob_on_test_data(classifier, X_test)
    test_metrics = ml.compute_performance_metrics(y_test, y_pred, y_pred_prob)
    pickle.dump(test_metrics, open('performance_metrics.pkl', 'wb'))
    performance_monitor = s3.Bucket(name=os.getenv('SCW_PERF_MONITOR'))  # type: ignore
    performance_monitor.upload_file(Filename='/ml-job/performance_metrics.pkl', Key='performance_metrics.pkl')

    # save roc_auc plot
    ml.save_roc_plot(classifier, X_test, y_test)
    performance_monitor.upload_file(Filename='/ml-job/roc_auc_curve.png', Key='roc_auc_curve.png')

    # save confusion matrix
    ml.save_confusion_matrix_plot(classifier, X_test, y_test)
    performance_monitor.upload_file(Filename='/ml-job/confusion_matrix.png', Key='confusion_matrix.png')

    return 0

if __name__ == '__main__':
    sys.exit(main())
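
The `ml_training` module referenced above is not loaded in this diff view. Purely for orientation, here is a hypothetical sketch of the interface that `main.py` calls; the model choice (logistic regression), the oversampling strategy (imblearn's `RandomOverSampler`), and all hyperparameters are assumptions and may differ from the actual file:

```python
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (ConfusionMatrixDisplay, RocCurveDisplay,
                             accuracy_score, f1_score, roc_auc_score)
from sklearn.model_selection import GridSearchCV, train_test_split


def load_data(path: str) -> pd.DataFrame:
    # assumes the ';'-separated raw UCI format
    return pd.read_csv(path, sep=";")


def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    return data.dropna()


# transform_data is assumed identical to the version in
# container/inference-api/data_processing.py and is omitted here.


def split_to_train_test_data(data: pd.DataFrame):
    # assumes the target column `y` is already encoded as 0/1
    X, y = data.drop("y", axis=1), data["y"]
    return train_test_split(X, y, test_size=0.3, random_state=0)


def over_sample_target_class(X_train, y_train):
    # the raw dataset is heavily imbalanced (few subscriptions)
    return RandomOverSampler(random_state=0).fit_resample(X_train, y_train)


def tune_classifier(X_train, y_train):
    grid = GridSearchCV(LogisticRegression(max_iter=1000), {"C": [0.1, 1.0, 10.0]})
    grid.fit(X_train, y_train)
    return grid.best_estimator_, grid.best_params_


def predict_on_test_data(classifier, X_test):
    return classifier.predict(X_test)


def predict_prob_on_test_data(classifier, X_test):
    return classifier.predict_proba(X_test)


def compute_performance_metrics(y_test, y_pred, y_pred_prob) -> dict:
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_pred_prob[:, 1]),
    }


def save_roc_plot(classifier, X_test, y_test):
    RocCurveDisplay.from_estimator(classifier, X_test, y_test)
    plt.savefig("roc_auc_curve.png")


def save_confusion_matrix_plot(classifier, X_test, y_test):
    ConfusionMatrixDisplay.from_estimator(classifier, X_test, y_test)
    plt.savefig("confusion_matrix.png")
```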