update script to use s3
RobotSail committed Oct 25, 2024
1 parent accaaae commit e048e29
Showing 4 changed files with 262 additions and 47 deletions.
32 changes: 28 additions & 4 deletions .github/workflows/print-loss.yml
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
name: Print Loss

on:
@@ -22,14 +23,37 @@ jobs:

      - name: Install dependencies
        run: |
          # python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      - name: Generate test data
        run: |
          printf '{"total_loss": 2.0}\n{"total_loss": 1.8023}\n{"total_loss": 1.52324}\n{"total_loss": 1.3234}' > test-log.jsonl
          ls -al

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Try to upload to S3
        run: |
          which aws
          bucket_name='os-ci-loss-curve-test'
          output_file='./test.md'
          python scripts/create-loss-graph.py --source file \
            --log-file test-log.jsonl \
            --output-file "${output_file}" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${bucket_name}" \
            --base-branch "${{ github.event.pull_request.base.ref }}" \
            --pr-number "${{ github.event.pull_request.number }}" \
            --head-sha "${{ github.event.pull_request.head.sha }}" \
            --origin-repository "${{ github.repository }}"
          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
          echo "test 1: https://github.com/${{ github.repository }}/commit/${{ github.sha }}"
          echo "test 2: ${{ github.event.pull_request.html_url }}/commits/${{ github.sha }}"
3 changes: 2 additions & 1 deletion requirements-dev.txt
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

# blot this out
# -r requirements.txt

pre-commit>=3.0.4,<5.0
pylint>=2.16.2,<4.0
216 changes: 174 additions & 42 deletions scripts/create-loss-graph.py
@@ -1,16 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# Standard
from argparse import ArgumentParser
from base64 import b64decode
from pathlib import Path
from subprocess import run
from typing import List
import json
import os

# Third Party
from matplotlib import pyplot as plt
from pydantic import BaseModel

ENV_VAR_NAME = "LOSS_DATA"


class Arguments(BaseModel):
    log_file: str | None = None
    source: str  # one of: file, env
    output_file: str
    aws_region: str
    bucket_name: str
    base_branch: str
    pr_number: str
    head_sha: str
    origin_repository: str
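# Note: pydantic validates the field types when Arguments is constructed, so a
# missing required field (e.g. aws_region) raises a ValidationError up front.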


def render_image(loss_data: List[float], outfile: Path) -> None:
@@ -30,56 +47,171 @@ def create_b64_data(log_file: Path) -> str:
    plt.ylabel("Loss")
    plt.title("Training performance over fixed dataset")

    if outfile.exists():
        outfile.unlink()

    plt.savefig(outfile, format="png")
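    # NOTE: on a headless CI runner matplotlib falls back to a non-interactive
    # backend (Agg), so savefig works without a display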


def read_loss_data(src: str, log_file: Path | None = None) -> List[float]:
    match src:
        case "env":
            data = os.getenv(ENV_VAR_NAME, None)
            if not data:
                raise ValueError(f"Environment variable {ENV_VAR_NAME} not set")
            # decode the base64 data
            data = b64decode(data)
        case "file":
            if not log_file:
                raise ValueError("log_file must be provided when source is file")
            if not log_file.exists():
                raise FileNotFoundError(f"Log file {log_file} does not exist")
            if log_file.is_dir():
                raise ValueError(f"Log file {log_file} is a directory")
            with open(log_file, "r") as f:
                data = f.read()
        case _:
            raise ValueError(f"Unknown source: {src}")

    # select the loss data
    contents = [json.loads(l) for l in data.splitlines()]
    loss_data = [item["total_loss"] for item in contents if "total_loss" in item]
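    # e.g. the workflow's test data ({"total_loss": 2.0} ... {"total_loss": 1.3234})
    # parses to loss_data == [2.0, 1.8023, 1.52324, 1.3234]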

    if not loss_data:
        raise ValueError("Loss data is empty")

    # ensure that the loss data is valid
    if not all(isinstance(l, float) for l in loss_data):
        raise ValueError("Loss data must be a list of floats")

    return loss_data


def write_to_s3(
    file: Path,
    bucket_name: str,
    destination: str,
):
    if not file.exists():
        raise RuntimeError(f"File {file} does not exist")

    s3_path = f"s3://{bucket_name}/{destination}"
    # no check=True here: a failed copy is reported through the explicit
    # returncode check below, which includes the CLI's stderr in the error
    results = run(["aws", "s3", "cp", str(file), s3_path], capture_output=True)
    if results.returncode != 0:
        raise RuntimeError(f"failed to upload to s3: {results.stderr.decode('utf-8')}")
    print(results.stdout.decode("utf-8"))
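# Note: capture_output=True keeps the AWS CLI's own output out of the live log;
# stdout is re-printed above so the workflow log still records the copy result.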


def get_destination_path(base_ref: str, pr_number: str, head_sha: str) -> str:
    return f"pulls/{base_ref}/{pr_number}/{head_sha}/loss-graph.png"


def write_md_file(
    output_file: Path, url: str, pr_number: str, head_sha: str, origin_repository: str
):
    commit_url = f"https://github.com/{origin_repository}/commit/{head_sha}"
    md_template = f"""
# Loss Graph for PR {pr_number} [({head_sha[:5]})]({commit_url})
![Loss Graph]({url})
"""
    output_file.write_text(md_template, encoding="utf-8")


def get_url(bucket_name: str, destination: str, aws_region: str) -> str:
    return f"https://{bucket_name}.s3.{aws_region}.amazonaws.com/{destination}"


def main(args: Arguments):
    # first things first, we create the png file to upload to S3
    log_file = Path(args.log_file) if args.log_file else None
    loss_data = read_loss_data(src=args.source, log_file=log_file)
    output_image = Path("/tmp/loss-graph.png")
    output_file = Path(args.output_file)
    render_image(loss_data=loss_data, outfile=output_image)
    destination_path = get_destination_path(
        base_ref=args.base_branch, pr_number=args.pr_number, head_sha=args.head_sha
    )
    write_to_s3(
        file=output_image, bucket_name=args.bucket_name, destination=destination_path
    )
    s3_url = get_url(
        bucket_name=args.bucket_name,
        destination=destination_path,
        aws_region=args.aws_region,
    )
    write_md_file(
        output_file=output_file,
        url=s3_url,
        pr_number=args.pr_number,
        head_sha=args.head_sha,
        origin_repository=args.origin_repository,
    )
    print(f"Loss graph uploaded to '{s3_url}'")
    print(f"Markdown file written to '{output_file}'")


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "--source",
        choices=["file", "env"],
        default="file",
        help=(
            "Source of the loss data. With 'file', read it from the file given "
            "via --log-file; with 'env', read a base64-encoded JSONL document "
            "from the LOSS_DATA environment variable."
        ),
    )
    parser.add_argument(
        "--log-file",
        type=str,
        default=None,
        help="The log file to read the loss data from.",
    )
    parser.add_argument(
        "--output-file",
        type=str,
        required=True,
        help="The output file where the resulting markdown will be written.",
    )
    parser.add_argument(
        "--aws-region",
        type=str,
        required=True,
        help="AWS region to which the S3 bucket belongs.",
    )
    parser.add_argument(
        "--bucket-name", type=str, required=True, help="The S3 bucket name."
    )
    parser.add_argument(
        "--base-branch",
        type=str,
        required=True,
        help="The base branch being merged to.",
    )
    parser.add_argument("--pr-number", type=str, required=True, help="The PR number.")
    parser.add_argument(
        "--head-sha", type=str, required=True, help="The head SHA of the PR."
    )
    parser.add_argument(
        "--origin-repository",
        type=str,
        required=True,
        help="The repository to which the originating branch belongs.",
    )

    args = parser.parse_args()
    arguments = Arguments(
        log_file=args.log_file,
        source=args.source,
        output_file=args.output_file,
        aws_region=args.aws_region,
        bucket_name=args.bucket_name,
        base_branch=args.base_branch,
        pr_number=args.pr_number,
        head_sha=args.head_sha,
        origin_repository=args.origin_repository,
    )
    main(arguments)
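For reference, the new --source env path can be exercised locally too; a minimal
sketch, assuming exported AWS credentials, GNU coreutils base64 (for -w0), and
placeholder values for the region, PR number, and repository:

    export LOSS_DATA="$(base64 -w0 test-log.jsonl)"
    python scripts/create-loss-graph.py --source env \
        --output-file ./test.md \
        --aws-region us-east-1 \
        --bucket-name os-ci-loss-curve-test \
        --base-branch main \
        --pr-number 123 \
        --head-sha "$(git rev-parse HEAD)" \
        --origin-repository <org>/<repo>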
58 changes: 58 additions & 0 deletions scripts/render-results.sh
@@ -0,0 +1,58 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0

set -eo pipefail

# This script exists to upload data to s3 and render the final markdown
# file for the results of the benchmarking.

function upload_to_s3() {
    local -r bucket_name="$1"
    local -r file_path="$2"
    local -r destination_path="$3"

    if [[ ! -f "${file_path}" ]]; then
        echo "Error: File '${file_path}' does not exist." >&2
        exit 1
    fi

    local -r bucket_path="s3://${bucket_name}/${destination_path}"
    printf 'Uploading result to S3: %s\n' "${bucket_path}"
    aws s3 cp "${file_path}" "${bucket_path}"
}
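# usage sketch (hypothetical values):
#   upload_to_s3 'os-ci-loss-curve-test' './loss.png' 'pulls/main/123/<sha>/loss.png'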

################################################################################
# Returns the path where the loss.png file will be uploaded.
# Currently, the format is in the form of:
# pulls/<base_branch>/<pr_number>/<sha>/loss.png
# This way, a single PR can have multiple runs and we can keep track of them.
# Globals:
# github (read-only) - The github context
# Arguments:
# None
# Returns:
# (string) The path where the loss.png file will be uploaded.
################################################################################
function get_s3_path() {
    printf 'pulls/%s/%s/%s/loss.png' "${{ github.event.pull_request.base.ref }}" "${{ github.event.pull_request.number }}" "${{ github.event.pull_request.head.sha }}"
}
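# NOTE: the ${{ ... }} expressions above are GitHub Actions template syntax; they
# are only substituted when this body is embedded in a workflow `run:` step, and
# plain bash rejects them as a bad substitution when the script runs standalone.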

function export_results() {
    local -r img_url="$1"
    printf '### Test performance:\n\n![Loss curve](%s)\n' "${img_url}" >> "${GITHUB_STEP_SUMMARY}"
}

function main() {
    local -r output_path=$(get_s3_path)
    local -r bucket_name='os-ci-loss-curve-test'
    local -r access_region="${{ vars.AWS_REGION }}"
    local -r input_file='./loss.png'
    local -r final_url="https://${bucket_name}.s3.${access_region}.amazonaws.com/${output_path}"

    printf 'Uploading image "%s" to bucket "%s" at output path "%s"\n' "${input_file}" "${bucket_name}" "${output_path}"
    upload_to_s3 "${bucket_name}" "${input_file}" "${output_path}"

    printf 'Final url should be: "%s"\n' "${final_url}"
    export_results "${final_url}"
}

main
