Merge branch 'main' into add_credits_to_ubc_and_university_of_wiscons…

…in_in_readme
UBC-MDS · Jun 25, 2024 · a28fef1 · a28fef1
2 parents 3d56d69 + 70e574e
commit a28fef1
Show file tree

Hide file tree

Showing 292 changed files with 1,420 additions and 46,503 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = tests/*
diff --git a/.github/workflows/ci-full.yml b/.github/workflows/ci-full.yml
@@ -0,0 +1,78 @@
+name: ci-full
+
+on:
+  # Only pushes to the protected branch (`main`) should trigger this workflow.
+  # Triggering it on pushes to **unprotected** branches, or on pull requests
+  # to **any** branch, would expose the secrets to unapproved changes
+  # (effectively remote code execution).
+  push:
+    branches:
+      - main
+
+jobs:
+
+  test-then-build-wheel:
+    # Runner image for the full test run and the wheel build
+    runs-on: ubuntu-latest
+
+    outputs:
+      wheelfile: ${{ steps.wheel-name.outputs.WHEELFILE }}  # wheel filename, read by wheel-install-test
+
+    environment: integration-testing  # NOTE(review): presumably where OPENAI_API_KEY is scoped - confirm
+
+    # Job steps: install tooling, run the test suite, then build and upload the wheel
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      - name: Check-out repository
+        uses: actions/checkout@v3
+
+      - name: Install poetry
+        uses: snok/install-poetry@v1
+
+      - name: Install package
+        run: poetry install
+
+      - name: Install pandoc
+        run: sudo apt-get update && sudo apt-get install -y pandoc
+
+      - name: Install tectonic
+        run: sudo snap refresh && sudo snap install tectonic
+
+      - name: Test with pytest  # no marker filter here, unlike the `ci` workflow
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: poetry run pytest --cov
+
+      - name: Test package building process  # dist/ is removed first, then rebuilt
+        run: rm -rf ./dist && poetry build --format wheel --output dist
+
+      - name: Get built wheel path  # `ls dist` must list only the wheel built above
+        id: wheel-name
+        run: echo "WHEELFILE=$(ls dist)" >> "$GITHUB_OUTPUT"
+
+      - name: Upload test build as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-wheel  # artifact name the wheel-install-test job downloads
+          path: dist/${{ steps.wheel-name.outputs.WHEELFILE }}
+
+  wheel-install-test:  # installs the built wheel on a fresh runner with no repository checkout
+    runs-on: ubuntu-latest
+    needs: [test-then-build-wheel]  # provides the artifact and the `wheelfile` output
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      - name: Download built wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: test-wheel  # must match the artifact name used by the upload step
+
+      - name: Test pip install local wheel
+        run: pip install -U wheel ${{ needs.test-then-build-wheel.outputs.wheelfile }}
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,13 +1,16 @@
 name: ci
 
-on: [push, pull_request]
+on: [push]
 
 jobs:
 
-  ci:
+  unit-test-then-build-wheel:
     # Set up operating system
     runs-on: ubuntu-latest
 
+    outputs:
+      wheelfile: ${{ steps.wheel-name.outputs.WHEELFILE }}
+
     # Define job steps
     steps:
       - name: Set up Python
@@ -30,5 +33,35 @@ jobs:
       - name: Install tectonic
         run: sudo snap refresh && sudo snap install tectonic
 
-      - name: Test with pytest
-        run: poetry run pytest tests
+      - name: Test with pytest (skip integration tests)
+        run: poetry run pytest --cov -m "not integration"
+
+      - name: Test package building process
+        run: rm -rf ./dist && poetry build --format wheel --output dist
+
+      - name: Get built wheel path
+        id: wheel-name
+        run: echo "WHEELFILE=$(ls dist)" >> "$GITHUB_OUTPUT"
+
+      - name: Upload test build as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-wheel
+          path: dist/${{ steps.wheel-name.outputs.WHEELFILE }}
+
+  wheel-install-test:  # installs the built wheel on a fresh runner with no repository checkout
+    runs-on: ubuntu-latest
+    needs: [unit-test-then-build-wheel]  # provides the artifact and the `wheelfile` output
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      - name: Download built wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: test-wheel  # must match the artifact name used by the upload step
+
+      - name: Test pip install local wheel
+        run: pip install -U wheel ${{ needs.unit-test-then-build-wheel.outputs.wheelfile }}
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 draft/*
 **/*~
 .env
+/data/
 
 # vim swap files
 **/*~
@@ -12,4 +13,14 @@ draft/*
 .idea/
 
 # pycache
-**/__pycache__/
+**/__pycache__/
+
+# built package
+dist/
+
+# pytest-cov coverage files
+.coverage*
+
+# quarto github page related
+/.quarto/
+/_site/
diff --git a/Makefile b/Makefile
@@ -0,0 +1,38 @@
+# References:
+# 	https://swcarpentry.github.io/make-novice/02-makefiles.html
+# 	https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/materials/lectures/09-pipelines.html
+
+# .PHONY tells make that targets such as 'all' and 'clean' are not files; each is declared next to its rule.
+.PHONY : all
+# The 'all' target is the default target. It depends on 'report/docs/index.html', which triggers the build process for this file.
+all : report/docs/index.html 
+
+# Unzip batch_run.zip
+data/batch_run/batch_run_3.5-turbo \
+	data/batch_run/batch_run_4-turbo \
+	data/batch_run/batch_run_4o : 
+	unzip data/batch_run/batch_run.zip -d data/batch_run/
+
+# Build 'data/processed/ground_truth.csv' by running the preprocessing script.
+data/processed/ground_truth.csv : analysis/preprocess_batch_run_result.py data/batch_run/batch_run_3.5-turbo
+	python analysis/preprocess_batch_run_result.py
+
+# Build 'report/docs/index.html' by rendering the Quarto project (see _quarto.yml).
+report/docs/index.html : data/processed/ground_truth.csv
+	quarto render
+
+.PHONY : publish
+publish : data/processed/ground_truth.csv
+	quarto publish gh-pages
+
+# The 'clean' target is used to clean up generated files and directories.
+.PHONY : clean
+clean : 
+	rm -rf report/docs/*
+	touch report/docs/.gitkeep
+	rm -rf data/batch_run/batch_run_3.5-turbo
+	rm -rf data/batch_run/batch_run_4-turbo
+	rm -rf data/batch_run/batch_run_4o
+	rm -rf data/processed/ground_truth.csv
+	rm -rf data/processed/score_*csv
+
diff --git a/report/final_report/_quarto.yml → _quarto.yml b/report/final_report/_quarto.yml → _quarto.yml
@@ -1,8 +1,8 @@
 project:
   type: website
   render:
-    - "*qmd"
-  output-dir: docs
+    - "report/*qmd"
+  output-dir: report/docs
 
 website:
   sidebar:
@@ -11,9 +11,9 @@ website:
     search: true
     contents:
       - text: "Capstone Final Report"
-        href: final_report.qmd
+        href: report/final_report.qmd
       - text: "Capstone Proposal"
-        href: proposal.qmd
+        href: report/proposal.qmd
 
 format:
   html:

diff --git a/analysis/batch_run.py b/analysis/batch_run.py
@@ -25,14 +25,16 @@ def main(config_yml):
         from_dir = config['repo_base_path']
         to_dir = config['response_path']
         repos = config['repo']
+        model = config['model']
 
         record = []
         for repo in tqdm(repos):
             for run in range(1, runs+1):
                 action.evaluate(
                     repo_path=os.path.join(from_dir, repo['path']),
-                    save_to=f"{to_dir}/{repo['name']}_{"{:02d}".format(run)}.json",
-                    checklist_path=checklist_path
+                    save_response_to=f"{to_dir}/{repo['name']}_{"{:02d}".format(run)}.json",
+                    checklist_path=checklist_path,
+                    model=model
                 )
 
                 record.append({
@@ -43,4 +45,4 @@ def main(config_yml):
                 with open(f"{to_dir}/record.yml", 'w') as file:
                     yaml.dump(record, file)
 
-    fire.Fire(main)
+    fire.Fire(main)
diff --git a/analysis/batch_run.yml b/analysis/batch_run.yml
@@ -1,7 +1,8 @@
 runs: 30
 checklist_path: null
-repo_base_path: '../data/raw/openja/'
-response_path: '../data/processed/batch_run'
+model: 'gpt-3.5-turbo'
+repo_base_path: 'data/raw/openja/'
+response_path: 'data/processed/batch_run_3.5-turbo'
 repo:
   - name: lightfm
     path: './lightfm'

diff --git a/report/final_report/01_preprocess.ipynb → analysis/ipynb/01_preprocess.ipynb b/report/final_report/01_preprocess.ipynb → analysis/ipynb/01_preprocess.ipynb
@@ -178,19 +178,26 @@
    "outputs": [],
    "source": [
     "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/processed/batch_run_3.5-turbo/')\n",
-    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)"
+    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
+    "\n",
+    "df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)\n",
+    "df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)\n",
+    "df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "31c1ce0b-14e3-4825-aa6e-74dd4d4af960",
+   "execution_count": 5,
+   "id": "853ba351-b620-4833-8683-a4e62c7fd9a4",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)\n",
-    "df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)\n",
-    "df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)"
+    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/processed/batch_run_4-turbo/')\n",
+    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
+    "\n",
+    "df_repo_run.to_csv('score_by_repo_run_4-turbo.csv', index=False)\n",
+    "df_repo__stat.to_csv('score_stat_by_repo_4-turbo.csv', index=False)\n",
+    "df_repo__count.to_csv('score_count_by_repo_4-turbo.csv', index=False)"
    ]
   },
   {
@@ -200,7 +207,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/processed/batch_run_4o/')\n",
+    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4o/')\n",
     "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
     "\n",
     "df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)\n",
@@ -221,16 +228,6 @@
     "ground_truth_df.to_csv('ground_truth.csv')"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "82d58e97-827a-4a3c-a313-c7623341bdd2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#df_repo__count.melt(id_vars=['repo', 'level_1'])"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,