From 2adf1626b2fd5ff299d42011dee5095405049d20 Mon Sep 17 00:00:00 2001
From: Simon Shillaker
Date: Wed, 6 Dec 2023 16:24:10 +0100
Subject: [PATCH] Run-through

---
 jobs/ml-ops/README.md              | 25 ++++++++++---------------
 jobs/ml-ops/data/main.py           |  6 +++---
 jobs/ml-ops/docker-compose.yml     | 18 +++++++++---------
 jobs/ml-ops/inference/main.py      |  6 +++---
 jobs/ml-ops/terraform/container.tf | 10 +++-------
 jobs/ml-ops/terraform/jobs.tf      | 14 +++++++-------
 jobs/ml-ops/terraform/outputs.tf   | 12 ++++++++++++
 jobs/ml-ops/training/main.py       |  6 +++---
 8 files changed, 50 insertions(+), 47 deletions(-)
 create mode 100644 jobs/ml-ops/terraform/outputs.tf

diff --git a/jobs/ml-ops/README.md b/jobs/ml-ops/README.md
index f4c3052..a49f8f6 100644
--- a/jobs/ml-ops/README.md
+++ b/jobs/ml-ops/README.md
@@ -1,12 +1,12 @@
 # Serverless MLOps
 
-In this example, we train and deploy a binary classification inference model using Scaleway Serverless. To do this, we use the following resources:
+In this example, we train and deploy a binary classification inference model using Scaleway Serverless Jobs and Containers. To do this, we use the following resources:
 
-1. Serverless Job for training
-2. Serverless Job to populate data in S3
+1. Serverless Job to populate data in S3
+2. Serverless Job for training
 3. Serverless Container for inference
 
-We use object storage to share data between the two.
+We use object storage to share data between the steps.
 
 ## Context
 
@@ -42,22 +42,17 @@ terraform apply
 
 ### Step 2. Run the data and training Jobs
 
-*At the time of writing, the Scaleway CLI does not support Jobs, so we use a Python script*
+To run the data and training Jobs, we can use the Scaleway CLI:
 
 ```
-cd scripts
+scw jobs run $(terraform output -raw data_job_id)
+scw jobs runs ls
 
-python3 -m venv venv
-source venv/bin/activate
-pip install -r requirements.txt
-
-python3 run upload
-python3 run training
+scw jobs run $(terraform output -raw training_job_id)
+scw jobs runs ls
 ```
 
-You can then check your Job runs in the [Jobs Console](https://console.scaleway.com/serverless-jobs/jobs).
-
-### Step 4. Use the inference API
+### Step 3. 
Use the inference API ``` export INFERENCE_URL=$(terraform output endpoint) diff --git a/jobs/ml-ops/data/main.py b/jobs/ml-ops/data/main.py index 1b963c4..0be8b6f 100644 --- a/jobs/ml-ops/data/main.py +++ b/jobs/ml-ops/data/main.py @@ -32,9 +32,9 @@ def main(): with zipfile.ZipFile(NESTED_ZIP_PATH) as fh: fh.extractall(DATA_DIR) - access_key = os.environ["SCW_ACCESS_KEY"] - secret_key = os.environ["SCW_SECRET_KEY"] - region_name = os.environ["SCW_REGION"] + access_key = os.environ["ACCESS_KEY"] + secret_key = os.environ["SECRET_KEY"] + region_name = os.environ["REGION"] bucket_name = os.environ["S3_BUCKET_NAME"] s3_url = os.environ["S3_URL"] diff --git a/jobs/ml-ops/docker-compose.yml b/jobs/ml-ops/docker-compose.yml index 48173ab..aba2fb0 100644 --- a/jobs/ml-ops/docker-compose.yml +++ b/jobs/ml-ops/docker-compose.yml @@ -7,9 +7,9 @@ services: depends_on: - minio environment: - - SCW_ACCESS_KEY=example - - SCW_SECRET_KEY=example-password - - SCW_REGION=foo + - ACCESS_KEY=example + - SECRET_KEY=example-password + - REGION=foo - S3_BUCKET_NAME=mlops - S3_URL=http://minio:9000 @@ -19,9 +19,9 @@ services: depends_on: - minio environment: - - SCW_ACCESS_KEY=example - - SCW_SECRET_KEY=example-password - - SCW_REGION=foo + - ACCESS_KEY=example + - SECRET_KEY=example-password + - REGION=foo - S3_BUCKET_NAME=mlops - S3_URL=http://minio:9000 @@ -33,9 +33,9 @@ services: depends_on: - minio environment: - - SCW_ACCESS_KEY=example - - SCW_SECRET_KEY=example-password - - SCW_REGION=foo + - ACCESS_KEY=example + - SECRET_KEY=example-password + - REGION=foo - S3_BUCKET_NAME=mlops - S3_URL=http://minio:9000 diff --git a/jobs/ml-ops/inference/main.py b/jobs/ml-ops/inference/main.py index ce4b109..8292eff 100644 --- a/jobs/ml-ops/inference/main.py +++ b/jobs/ml-ops/inference/main.py @@ -21,9 +21,9 @@ class ClassifierLoader(object): @classmethod def load(cls, force=False): if force or cls._classifier is None: - access_key = os.environ["SCW_ACCESS_KEY"] - secret_key = os.environ["SCW_SECRET_KEY"] - region_name = os.environ["SCW_REGION"] + access_key = os.environ["ACCESS_KEY"] + secret_key = os.environ["SECRET_KEY"] + region_name = os.environ["REGION"] bucket_name = os.environ["S3_BUCKET_NAME"] s3_url = os.environ["S3_URL"] diff --git a/jobs/ml-ops/terraform/container.tf b/jobs/ml-ops/terraform/container.tf index eba6b61..5329163 100644 --- a/jobs/ml-ops/terraform/container.tf +++ b/jobs/ml-ops/terraform/container.tf @@ -16,15 +16,11 @@ resource "scaleway_container" "inference" { environment_variables = { "S3_BUCKET_NAME" = scaleway_object_bucket.main.name "S3_URL" = var.s3_url - "SCW_REGION" = var.region + "REGION" = var.region } secret_environment_variables = { - "SCW_ACCESS_KEY" = var.access_key - "SCW_SECRET_KEY" = var.secret_key + "ACCESS_KEY" = var.access_key + "SECRET_KEY" = var.secret_key } deploy = true } - -output "endpoint" { - value = scaleway_container.inference.domain_name -} diff --git a/jobs/ml-ops/terraform/jobs.tf b/jobs/ml-ops/terraform/jobs.tf index 59791ea..4058d89 100644 --- a/jobs/ml-ops/terraform/jobs.tf +++ b/jobs/ml-ops/terraform/jobs.tf @@ -8,15 +8,15 @@ resource scaleway_job_definition data { env = { "S3_BUCKET_NAME": scaleway_object_bucket.main.name, "S3_URL": var.s3_url, - "SCW_ACCESS_KEY": var.access_key, - "SCW_SECRET_KEY": var.secret_key, - "SCW_REGION": var.region + "ACCESS_KEY": var.access_key, + "SECRET_KEY": var.secret_key, + "REGION": var.region } } resource scaleway_job_definition training { name = "training" - cpu_limit = 6000 + cpu_limit = 4000 memory_limit = 4096 
image_uri = docker_image.training.name timeout = "10m" @@ -24,8 +24,8 @@ resource scaleway_job_definition training { env = { "S3_BUCKET_NAME": scaleway_object_bucket.main.name, "S3_URL": var.s3_url, - "SCW_ACCESS_KEY": var.access_key, - "SCW_SECRET_KEY": var.secret_key, - "SCW_REGION": var.region, + "ACCESS_KEY": var.access_key, + "SECRET_KEY": var.secret_key, + "REGION": var.region, } } diff --git a/jobs/ml-ops/terraform/outputs.tf b/jobs/ml-ops/terraform/outputs.tf new file mode 100644 index 0000000..ccdb5c5 --- /dev/null +++ b/jobs/ml-ops/terraform/outputs.tf @@ -0,0 +1,12 @@ + +output "endpoint" { + value = scaleway_container.inference.domain_name +} + +output "training_job_id" { + value = scaleway_job_definition.training.id +} + +output "data_job_id" { + value = scaleway_job_definition.data.id +} diff --git a/jobs/ml-ops/training/main.py b/jobs/ml-ops/training/main.py index 4fa6f8d..d520f3d 100644 --- a/jobs/ml-ops/training/main.py +++ b/jobs/ml-ops/training/main.py @@ -20,9 +20,9 @@ def main() -> int: Uploads training/test artifacts into artifact data stores. """ - access_key = os.environ["SCW_ACCESS_KEY"] - secret_key = os.environ["SCW_SECRET_KEY"] - region_name = os.environ["SCW_REGION"] + access_key = os.environ["ACCESS_KEY"] + secret_key = os.environ["SECRET_KEY"] + region_name = os.environ["REGION"] bucket_name = os.environ["S3_BUCKET_NAME"] s3_url = os.environ["S3_URL"]
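Note on the credential renames above: the hunks only show the `ACCESS_KEY` / `SECRET_KEY` / `REGION` variables being read from the environment, not how they are consumed. For reference, here is a minimal sketch of how a job (or the inference container) might turn them into an S3 client for the shared bucket. It assumes boto3, which this patch does not show, and the `s3_client` helper name is purely illustrative:

```python
import os

import boto3  # assumed dependency; the example's requirements are not shown in this patch


def s3_client():
    """Build an S3 client from the variables set in docker-compose.yml (local MinIO)
    and in terraform/jobs.tf / container.tf (Scaleway Object Storage)."""
    return boto3.client(
        "s3",
        endpoint_url=os.environ["S3_URL"],
        region_name=os.environ["REGION"],
        aws_access_key_id=os.environ["ACCESS_KEY"],
        aws_secret_access_key=os.environ["SECRET_KEY"],
    )


if __name__ == "__main__":
    s3 = s3_client()
    # List the bucket that the data, training and inference steps use to share artifacts.
    listing = s3.list_objects_v2(Bucket=os.environ["S3_BUCKET_NAME"])
    for obj in listing.get("Contents", []):
        print(obj["Key"], obj["Size"])
```

Because `S3_URL` points at the MinIO service locally and at Scaleway Object Storage when deployed, the same client code works unchanged in both environments.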