diff --git a/analysis/batch_run.yml b/analysis/batch_run.yml
index 64b803a..0214292 100644
--- a/analysis/batch_run.yml
+++ b/analysis/batch_run.yml
@@ -1,28 +1,28 @@
-runs: 28
+runs: 30
checklist_path: null
-model: 'gpt-4-turbo'
-repo_base_path: '../data/raw/openja/'
-response_path: '../data/processed/batch_run_4-turbo'
+model: 'gpt-3.5-turbo'
+repo_base_path: 'data/raw/openja/'
+response_path: 'data/processed/batch_run_3.5-turbo'
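+# Repositories to evaluate; each path below is presumably resolved relative to repo_base_path.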
repo:
- name: lightfm
path: './lightfm'
- # - name: qlib
- # path: './qlib'
- # - name: mmf
- # path: './mmf'
- # - name: nanodet
- # path: './nanodet'
- # - name: magenta
- # path: './magenta'
- # - name: nupic
- # path: './nupic'
- # - name: DeepSpeech
- # path: './DeepSpeech'
- # - name: apollo
- # path: './apollo'
- # - name: 'paperless-ng'
- # path: './paperless-ng'
- # - name: 'mycroft-core'
- # path: './mycroft-core'
- # - name: deepchem
- # path: './deepchem'
+ - name: qlib
+ path: './qlib'
+ - name: mmf
+ path: './mmf'
+ - name: nanodet
+ path: './nanodet'
+ - name: magenta
+ path: './magenta'
+ - name: nupic
+ path: './nupic'
+ - name: DeepSpeech
+ path: './DeepSpeech'
+ - name: apollo
+ path: './apollo'
+ - name: 'paperless-ng'
+ path: './paperless-ng'
+ - name: 'mycroft-core'
+ path: './mycroft-core'
+ - name: deepchem
+ path: './deepchem'
diff --git a/analysis/ipynb/01_preprocess.ipynb b/analysis/ipynb/01_preprocess.ipynb
new file mode 100644
index 0000000..c539db1
--- /dev/null
+++ b/analysis/ipynb/01_preprocess.ipynb
@@ -0,0 +1,269 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "18830011-62d1-4242-b851-e6e9ae47b49d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install scipy altair"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "47bea46f-2b65-42ce-801e-55bf6576a67a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import json\n",
+ "import yaml\n",
+ "import pandas as pd\n",
+ "from collections import Counter\n",
+ "\n",
+ "id_item_map = {\n",
+ " '2.1': 'Ensure Data File Loads as Expected',\n",
+ " '3.2': 'Data in the Expected Format',\n",
+ " '3.5': 'Check for Duplicate Records in Data',\n",
+ " '4.2': 'Verify Data Split Proportion',\n",
+ " '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
+ " '6.1': 'Verify Evaluation Metrics Implementation',\n",
+ " '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
+ "}\n",
+ "\n",
+ "ground_truth = [\n",
+ " {'repo': 'lightfm', 'id': '2.1', 'score': 1},\n",
+ " {'repo': 'lightfm', 'id': '3.2', 'score': 1},\n",
+ " {'repo': 'lightfm', 'id': '3.5', 'score': 0},\n",
+ " {'repo': 'lightfm', 'id': '4.2', 'score': 1},\n",
+ " {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},\n",
+ " {'repo': 'lightfm', 'id': '6.1', 'score': 1},\n",
+ " {'repo': 'lightfm', 'id': '6.2', 'score': 1},\n",
+ " {'repo': 'qlib', 'id': '2.1', 'score': 0.5},\n",
+ " {'repo': 'qlib', 'id': '3.2', 'score': 1},\n",
+ " {'repo': 'qlib', 'id': '3.5', 'score': 0},\n",
+ " {'repo': 'qlib', 'id': '4.2', 'score': 0.5},\n",
+ " {'repo': 'qlib', 'id': '5.3', 'score': 1},\n",
+ " {'repo': 'qlib', 'id': '6.1', 'score': 1},\n",
+ " {'repo': 'qlib', 'id': '6.2', 'score': 1},\n",
+ " {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},\n",
+ "]\n",
+ "\n",
+ "def get_score_report_from_response(resp_path, verbose=False):\n",
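+    "    '''Parse a single batch-run response JSON into a per-test-file score table.\n",
+    "\n",
+    "    Returns a DataFrame with one row per evaluated test file, one column per\n",
+    "    checklist item ID, plus `success` and `response_path` columns.\n",
+    "    '''\n",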
+ " if verbose:\n",
+ " print(resp_path)\n",
+ " with open(resp_path, 'rb') as file:\n",
+ " response = json.load(file)\n",
+ " \n",
+ " reports = [] # report for each test file\n",
+    "    for result in response['call_results']: # each call result covers one test file\n",
+ " if result['parsed_response']:\n",
+ " resp = result['parsed_response']['results']\n",
+ " for item in resp:\n",
+ " item['file'] = result['files_evaluated'][0] \n",
+ " item['success'] = result['success']\n",
+ " reports.append(item)\n",
+    "        # FIXME: failed runs are not handled yet\n",
+ " # else: # if the run is failed, the parsed_response will be None\n",
+ " # reports.append({\n",
+ " # 'ID': '2.1', \n",
+ " # 'Title': '',\n",
+ " # 'Requirement': '',\n",
+ " # 'Observation': '',\n",
+ " # 'Functions': [],\n",
+ " # 'Evaluation': '',\n",
+ " # 'Score': 0,\n",
+ " # 'file': result['files_evaluated'][0],\n",
+ " # 'success': result['success']\n",
+ " # })\n",
+ " \n",
+ " reports_df = pd.DataFrame(reports)\n",
+ " df = (\n",
+ " reports_df\n",
+ " .pivot(index='file', columns='ID', values='Score')\n",
+ " .rename_axis(None, axis=1)\n",
+ " )\n",
+ " df['success'] = reports_df.groupby(['file'])['success'].all()\n",
+ " df['response_path'] = os.path.abspath(resp_path)\n",
+ " \n",
+ " return df.reset_index()\n",
+ "\n",
+ "def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):\n",
+ " ''' Get score for each checklist item, by repository, by run and by test file\n",
+ " '''\n",
+ " with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:\n",
+ " config = pd.DataFrame(yaml.safe_load(file))\n",
+ "\n",
+ " config['response_path'] = config['response_path'].apply(\n",
+ " lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))\n",
+ " )\n",
+ " \n",
+ " tmp = [\n",
+ " get_score_report_from_response(\n",
+ " os.path.join(batch_run_dir_path, path),\n",
+ " verbose=verbose\n",
+ " ) for path in config['response_path']\n",
+ " ]\n",
+ " tmp = pd.concat(tmp, axis=0).reset_index(drop=True)\n",
+ " \n",
+ " return config.merge(tmp, on='response_path', how='left')\n",
+ "\n",
+ "def preprocess(df_repo_run_file, id_item_map=None):\n",
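+    "    '''Aggregate per-file scores to per-run scores (max across test files), then\n",
+    "    derive per-repo statistics (mean/std/count) and per-score counts.\n",
+    "\n",
+    "    Returns (df_repo_run, df_repo__stat, df_repo__count).\n",
+    "    '''\n",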
+ " if id_item_map is None:\n",
+ " id_item_map = {\n",
+ " '2.1': 'Ensure Data File Loads as Expected',\n",
+ " '3.2': 'Data in the Expected Format',\n",
+ " '3.5': 'Check for Duplicate Records in Data',\n",
+ " '4.2': 'Verify Data Split Proportion',\n",
+ " '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
+ " '6.1': 'Verify Evaluation Metrics Implementation',\n",
+ " '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
+ " }\n",
+ "\n",
+ " # prepare score data by repo, by run\n",
+ " df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({\n",
+ " id: ['max'] for id in id_item_map.keys()\n",
+ " })\n",
+ " df_repo_run.columns = [col[0] for col in df_repo_run.columns]\n",
+ " df_repo_run = df_repo_run.reset_index()\n",
+ " \n",
+ " # prepare statistics of scores by repo\n",
+ " df_repo__stat = df_repo_run.groupby(['repo']).agg({\n",
+ " id: ['mean', 'std', 'count'] for id in id_item_map.keys()\n",
+ " })\n",
+ " df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])\n",
+ " df_repo__stat.columns = ['repo', 'id', 'stat', 'value']\n",
+ " df_repo__stat = (\n",
+ " df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')\n",
+ " .reset_index()\n",
+ " .rename_axis(None, axis=1)\n",
+ " )\n",
+ " df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])\n",
+ " df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']\n",
+ " \n",
+ " # prepare counting of scores by repo\n",
+ " df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()\n",
+ " for id in list(id_item_map.keys())[1:]:\n",
+ " df_repo__count = df_repo__count.merge(\n",
+ " df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),\n",
+ " on=['repo', 'level_1'],\n",
+ " how='outer'\n",
+ " )\n",
+ " #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])\n",
+ " \n",
+ " df_repo__count = df_repo__count.fillna(0)\n",
+ "\n",
+ " df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')\n",
+ " df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])\n",
+ " df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']\n",
+ " \n",
+ " return (df_repo_run, df_repo__stat, df_repo__count)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "31c1ce0b-14e3-4825-aa6e-74dd4d4af960",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_3.5-turbo/')\n",
+ "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
+ "\n",
+ "df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)\n",
+ "df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)\n",
+ "df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6343bba3-4fff-4c24-8e71-30ec81df4c4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4-turbo/')\n",
+ "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
+ "\n",
+ "df_repo_run.to_csv('score_by_repo_run_4-turbo.csv', index=False)\n",
+ "df_repo__stat.to_csv('score_stat_by_repo_4-turbo.csv', index=False)\n",
+ "df_repo__count.to_csv('score_count_by_repo_4-turbo.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "11b75320-05a7-4db2-86ea-9c085df26d73",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4o/')\n",
+ "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
+ "\n",
+ "df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)\n",
+ "df_repo__stat.to_csv('score_stat_by_repo_4o.csv', index=False)\n",
+ "df_repo__count.to_csv('score_count_by_repo_4o.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "455c8477-e02d-44cd-80dc-854b3e9e0fa5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ground_truth_df = pd.DataFrame(ground_truth)\n",
+ "ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])\n",
+ "ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')\n",
+ "ground_truth_df.to_csv('ground_truth.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28b5150d-d11b-41a5-91bb-32af1a19a776",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "46ae0ecc-c510-418e-ae7b-6db3d6571219",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:fixml]",
+ "language": "python",
+ "name": "conda-env-fixml-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/ipynb/02_plots-for-final-report.ipynb b/analysis/ipynb/02_plots-for-final-report.ipynb
new file mode 100644
index 0000000..36e9feb
--- /dev/null
+++ b/analysis/ipynb/02_plots-for-final-report.ipynb
@@ -0,0 +1,693 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e9a74646-ec18-49c0-b9ef-ed3b5ba64087",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+       "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import altair as alt\n",
+ "import pandas as pd\n",
+ "\n",
+ "df_repo__stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')\n",
+ "gt = pd.read_csv('ground_truth.csv')\n",
+ "gt = gt.melt(id_vars=['id', 'title'], var_name='repo', value_name='ground_truth')\n",
+ "\n",
+ "df_repo__stat_with_gt = df_repo__stat.merge(gt, on=['id', 'title', 'repo'])\n",
+ "\n",
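+    "# error-bar bounds: mean +/- 1 SD, clipped to the valid score range [0, 1]\n",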
+ "base = alt.Chart(\n",
+ " df_repo__stat_with_gt.query('repo in [\"lightfm\", \"qlib\", \"DeepSpeech\"]')\n",
+ ").transform_calculate(\n",
+ " min=\"max(0, datum.mean-datum.std)\",\n",
+ " max=\"min(1, datum.mean+datum.std)\"\n",
+ ")\n",
+ " \n",
+ "# generate the points\n",
+ "points = base.mark_point(\n",
+ " filled=True,\n",
+ " size=50,\n",
+ " color='black'\n",
+ ").encode(\n",
+ " x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title(\"Score\").axis(\n",
+ " labelExpr=\"datum.value % 0.5 ? null : datum.label\"\n",
+ " ),\n",
+ " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),\n",
+ ")\n",
+ "\n",
+ "# generate the points for ground truth\n",
+ "gt_points = base.mark_point(\n",
+ " filled=True,\n",
+ " size=200,\n",
+ " color='green',\n",
+ " shape=\"diamond\"\n",
+ ").encode(\n",
+ " x=alt.X('ground_truth:Q'),\n",
+ " y=alt.Y('id_title:N')\n",
+ ")\n",
+ "\n",
+ "# generate the error bars\n",
+ "errorbars = base.mark_errorbar().encode(\n",
+ " x=alt.X(\"min:Q\").title('1 SD'), #\"id:N\",\n",
+ " x2=\"max:Q\",\n",
+ " y=\"id_title:N\"\n",
+ ")\n",
+ "\n",
+ "(gt_points + points + errorbars).facet(\n",
+ " column=alt.Column('repo:N').title(None)\n",
+ ").configure_axis( \n",
+ " labelFontSize=12, \n",
+ " titleFontSize=12\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa1c3071-33f8-47da-a10d-42b7dbdcce1d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a169da71-4be7-4c88-8553-d6b68c2b1edf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "[HTML rendering of the contingency table stripped; see the text/plain output below]"
+ ],
+ "text/plain": [
+ "score 0.0 \\\n",
+ "Repository Checklist Item Ground Truth \n",
+ "lightfm 3.5. Check for Duplicate Records in Data 0.0 30 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 1 \n",
+ " 2.1. Ensure Data File Loads as Expected 1.0 0 \n",
+ " 3.2. Data in the Expected Format 1.0 0 \n",
+ " 4.2. Verify Data Split Proportion 1.0 0 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 0 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 0 \n",
+ "qlib 3.5. Check for Duplicate Records in Data 0.0 23 \n",
+ " 2.1. Ensure Data File Loads as Expected 0.5 0 \n",
+ " 4.2. Verify Data Split Proportion 0.5 3 \n",
+ " 3.2. Data in the Expected Format 1.0 0 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 1 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 2 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 0 \n",
+ "\n",
+ "score 0.5 \\\n",
+ "Repository Checklist Item Ground Truth \n",
+ "lightfm 3.5. Check for Duplicate Records in Data 0.0 0 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 29 \n",
+ " 2.1. Ensure Data File Loads as Expected 1.0 0 \n",
+ " 3.2. Data in the Expected Format 1.0 30 \n",
+ " 4.2. Verify Data Split Proportion 1.0 11 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 5 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 1 \n",
+ "qlib 3.5. Check for Duplicate Records in Data 0.0 7 \n",
+ " 2.1. Ensure Data File Loads as Expected 0.5 0 \n",
+ " 4.2. Verify Data Split Proportion 0.5 25 \n",
+ " 3.2. Data in the Expected Format 1.0 14 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 25 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 18 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 24 \n",
+ "\n",
+ "score 1.0 \n",
+ "Repository Checklist Item Ground Truth \n",
+ "lightfm 3.5. Check for Duplicate Records in Data 0.0 0 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 0 \n",
+ " 2.1. Ensure Data File Loads as Expected 1.0 30 \n",
+ " 3.2. Data in the Expected Format 1.0 0 \n",
+ " 4.2. Verify Data Split Proportion 1.0 19 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 25 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 29 \n",
+ "qlib 3.5. Check for Duplicate Records in Data 0.0 0 \n",
+ " 2.1. Ensure Data File Loads as Expected 0.5 30 \n",
+ " 4.2. Verify Data Split Proportion 0.5 2 \n",
+ " 3.2. Data in the Expected Format 1.0 16 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 4 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 10 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 6 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_repo_run = pd.read_csv('score_by_repo_run_3.5-turbo.csv')\n",
+ "\n",
+ "df_repo_run = df_repo_run.merge(gt, on=['id', 'title', 'repo'])\n",
+ "\n",
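+    "# count, for each repository and checklist item, how many runs produced each score value\n",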
+ "contingency_table = pd.pivot_table(\n",
+ " df_repo_run,\n",
+ " values='run', \n",
+ " index=['repo', 'id_title', 'ground_truth'], \n",
+ " columns=['score'],\n",
+ " aggfunc='count', \n",
+ " fill_value=0\n",
+ ")\n",
+ "contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth']\n",
+ "contingency_table.sort_index(level=[0, 2])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "afc06ca7-5f39-4293-8bdb-9d46558e7535",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f23950d5-792f-4f0a-8e3a-1727b3598dd8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
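+    "# per-repo standard deviation of each checklist item's score across runs\n",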
+ "stds = df_repo__stat[['repo', 'std', 'id_title']].pivot(index='repo', columns='id_title').copy()\n",
+ "stds.columns = [col[1] for col in stds.columns]\n",
+ "stds = stds.reset_index()\n",
+ "stds = stds.melt(id_vars='repo', var_name='id_title')\n",
+ "\n",
+ "base = alt.Chart(stds)\n",
+ "\n",
+ "box = base.mark_boxplot(\n",
+ " color='grey',\n",
+ " opacity=0.5,\n",
+ " size=20,\n",
+ ").encode(\n",
+ " x=alt.X('value:Q').title('Standard Deviation of Scores'),\n",
+ " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))\n",
+ ")\n",
+ "\n",
+ "stripplot = base.mark_circle(size=100).encode(\n",
+ " y=alt.Y( \n",
+ " 'id_title:N',\n",
+ " axis=alt.Axis(ticks=False, grid=True, labels=True), \n",
+ " scale=alt.Scale(), \n",
+ " ), \n",
+ " x='value:Q',\n",
+ " yOffset=\"jitter:Q\",\n",
+ " color=alt.Color('id_title:N', legend=None),\n",
+ " tooltip='repo'\n",
+ ").transform_calculate(\n",
+ " # Generate Gaussian jitter with a Box-Muller transform\n",
+ " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
+ ")\n",
+ "\n",
+ "(\n",
+ " box + stripplot\n",
+ ").configure_view( \n",
+ " stroke=None\n",
+ ").configure_axis( \n",
+ " labelFontSize=12, \n",
+ " titleFontSize=12\n",
+ ").properties(\n",
+ " height=300, \n",
+ " width=600,\n",
+ " title=\"30 Runs on Openja's Repositories for each Checklist Item\"\n",
+ ") "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1d493dc0-5b75-4348-a627-b1194e498b0d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "bade3842-185e-4369-a5d7-4356290df058",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_repo_4o__stat = pd.read_csv('score_stat_by_repo_4o.csv')\n",
+ "df_repo_4o__stat_with_gt = df_repo_4o__stat.merge(gt, on=['id', 'title', 'repo'])\n",
+ "df_repo_4o__stat_with_gt['model'] = 'gpt-4o'\n",
+ "\n",
+ "df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query(\"repo == 'lightfm'\").copy()\n",
+ "df_repo_35turbo__stat_with_gt['model'] = 'gpt-3.5-turbo'\n",
+ "\n",
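+    "# combine per-model score statistics for comparison (gpt-3.5-turbo restricted to lightfm)\n",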
+ "df_model_comp = pd.concat(\n",
+ " (df_repo_35turbo__stat_with_gt, df_repo_4o__stat_with_gt), \n",
+ " axis=0\n",
+ ")\n",
+ "\n",
+ "base = alt.Chart(\n",
+ " df_model_comp\n",
+ ").transform_calculate(\n",
+ " min=\"max(0, datum.mean-datum.std)\",\n",
+ " max=\"min(1, datum.mean+datum.std)\"\n",
+ ")\n",
+ " \n",
+ "# generate the points\n",
+ "points = base.mark_point(\n",
+ " filled=True,\n",
+ " size=50,\n",
+ " color='black'\n",
+ ").encode(\n",
+ " x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title(\"Score\").axis(\n",
+ " labelExpr=\"datum.value % 0.5 ? null : datum.label\"\n",
+ " ),\n",
+ " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),\n",
+ ")\n",
+ "\n",
+ "# generate the points for ground truth\n",
+ "gt_points = base.mark_point(\n",
+ " filled=True,\n",
+ " size=200,\n",
+ " color='green',\n",
+ " shape=\"diamond\"\n",
+ ").encode(\n",
+ " x=alt.X('ground_truth:Q'),\n",
+ " y=alt.Y('id_title:N')\n",
+ ")\n",
+ "\n",
+ "# generate the error bars\n",
+ "errorbars = base.mark_errorbar().encode(\n",
+ " x=alt.X(\"min:Q\").title('1 SD'), #\"id:N\",\n",
+ " x2=\"max:Q\",\n",
+ " y=\"id_title:N\"\n",
+ ")\n",
+ "\n",
+ "(gt_points + points + errorbars).facet(\n",
+ " column=alt.Column('model:N').title(None)\n",
+ ").configure_axis( \n",
+ " labelFontSize=12, \n",
+ " titleFontSize=12\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1af0fef8-2c34-4166-affe-93224c639cf9",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:fixml]",
+ "language": "python",
+ "name": "conda-env-fixml-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/preprocess_batch_run_result.py b/analysis/preprocess_batch_run_result.py
new file mode 100644
index 0000000..d792633
--- /dev/null
+++ b/analysis/preprocess_batch_run_result.py
@@ -0,0 +1,163 @@
+import os
+import json
+import yaml
+import pandas as pd
+from collections import Counter
+
+id_item_map = {
+ '2.1': 'Ensure Data File Loads as Expected',
+ '3.2': 'Data in the Expected Format',
+ '3.5': 'Check for Duplicate Records in Data',
+ '4.2': 'Verify Data Split Proportion',
+ '5.3': 'Ensure Model Output Shape Aligns with Expectation',
+ '6.1': 'Verify Evaluation Metrics Implementation',
+ '6.2': "Evaluate Model's Performance Against Thresholds"
+}
+
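+# Reference ("ground truth") checklist scores for three Openja repositories,
+# used for comparison against the generated scores.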
+ground_truth = [
+ {'repo': 'lightfm', 'id': '2.1', 'score': 1},
+ {'repo': 'lightfm', 'id': '3.2', 'score': 1},
+ {'repo': 'lightfm', 'id': '3.5', 'score': 0},
+ {'repo': 'lightfm', 'id': '4.2', 'score': 1},
+ {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
+ {'repo': 'lightfm', 'id': '6.1', 'score': 1},
+ {'repo': 'lightfm', 'id': '6.2', 'score': 1},
+ {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
+ {'repo': 'qlib', 'id': '3.2', 'score': 1},
+ {'repo': 'qlib', 'id': '3.5', 'score': 0},
+ {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
+ {'repo': 'qlib', 'id': '5.3', 'score': 1},
+ {'repo': 'qlib', 'id': '6.1', 'score': 1},
+ {'repo': 'qlib', 'id': '6.2', 'score': 1},
+ {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
+]
+
+def get_score_report_from_response(resp_path, verbose=False):
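+    '''Parse a single batch-run response JSON into a per-test-file score table.
+
+    Returns a DataFrame with one row per evaluated test file, one column per
+    checklist item ID, plus `success` and `response_path` columns.
+    '''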
+ if verbose:
+ print(resp_path)
+ with open(resp_path, 'rb') as file:
+ response = json.load(file)
+
+ reports = [] # report for each test file
+    for result in response['call_results']: # each call result covers one test file
+ if result['parsed_response']:
+ resp = result['parsed_response']['results']
+ for item in resp:
+ item['file'] = result['files_evaluated'][0]
+ item['success'] = result['success']
+ reports.append(item)
+        # FIXME: failed runs are not handled yet
+ # else: # if the run is failed, the parsed_response will be None
+ # reports.append({
+ # 'ID': '2.1',
+ # 'Title': '',
+ # 'Requirement': '',
+ # 'Observation': '',
+ # 'Functions': [],
+ # 'Evaluation': '',
+ # 'Score': 0,
+ # 'file': result['files_evaluated'][0],
+ # 'success': result['success']
+ # })
+
+ reports_df = pd.DataFrame(reports)
+ df = (
+ reports_df
+ .pivot(index='file', columns='ID', values='Score')
+ .rename_axis(None, axis=1)
+ )
+ df['success'] = reports_df.groupby(['file'])['success'].all()
+ df['response_path'] = os.path.abspath(resp_path)
+
+ return df.reset_index()
+
+def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):
+ ''' Get score for each checklist item, by repository, by run and by test file
+ '''
+ with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:
+ config = pd.DataFrame(yaml.safe_load(file))
+
+ config['response_path'] = config['response_path'].apply(
+ lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))
+ )
+
+ tmp = [
+ get_score_report_from_response(
+ os.path.join(batch_run_dir_path, path),
+ verbose=verbose
+ ) for path in config['response_path']
+ ]
+ tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
+
+ return config.merge(tmp, on='response_path', how='left')
+
+def preprocess(df_repo_run_file, id_item_map=None):
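+    '''Aggregate per-file scores to per-run scores (max across test files), then
+    derive per-repo statistics (mean/std/count) and per-score counts.
+
+    Returns (df_repo_run, df_repo__stat, df_repo__count).
+    '''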
+ if id_item_map is None:
+ id_item_map = {
+ '2.1': 'Ensure Data File Loads as Expected',
+ '3.2': 'Data in the Expected Format',
+ '3.5': 'Check for Duplicate Records in Data',
+ '4.2': 'Verify Data Split Proportion',
+ '5.3': 'Ensure Model Output Shape Aligns with Expectation',
+ '6.1': 'Verify Evaluation Metrics Implementation',
+ '6.2': "Evaluate Model's Performance Against Thresholds"
+ }
+
+ # prepare score data by repo, by run
+ df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
+ id: ['max'] for id in id_item_map.keys()
+ })
+ df_repo_run.columns = [col[0] for col in df_repo_run.columns]
+ df_repo_run = df_repo_run.reset_index()
+
+ # prepare statistics of scores by repo
+ df_repo__stat = df_repo_run.groupby(['repo']).agg({
+ id: ['mean', 'std', 'count'] for id in id_item_map.keys()
+ })
+ df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
+ df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
+ df_repo__stat = (
+ df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
+ .reset_index()
+ .rename_axis(None, axis=1)
+ )
+ df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])
+ df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']
+
+ # prepare counting of scores by repo
+ df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
+ for id in list(id_item_map.keys())[1:]:
+ df_repo__count = df_repo__count.merge(
+ df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
+ on=['repo', 'level_1'],
+ how='outer'
+ )
+ #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])
+
+ df_repo__count = df_repo__count.fillna(0)
+
+ df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')
+ df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])
+ df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']
+
+ return (df_repo_run, df_repo__stat, df_repo__count)
+
+
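+# Build per-run scores, per-repo statistics, and score counts for each model's batch-run output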
+for model in ['3.5-turbo', '4-turbo', '4o']:
+ df_repo_run_file = get_scores_by_repo_by_run_by_file(f'data/batch_run/batch_run_{model}/')
+ df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)
+
+ df_repo_run.to_csv(f'data/processed/score_by_repo_run_{model}.csv', index=False)
+ df_repo__stat.to_csv(f'data/processed/score_stat_by_repo_{model}.csv', index=False)
+ df_repo__count.to_csv(f'data/processed/score_count_by_repo_{model}.csv', index=False)
+
+ground_truth_df = pd.DataFrame(ground_truth)
+ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])
+ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')
+ground_truth_df.to_csv('data/processed/ground_truth.csv')
\ No newline at end of file
diff --git a/analysis/results/figures/.gitkeep b/analysis/results/figures/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/analysis/results/output/.gitkeep b/analysis/results/output/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/analysis/results/tables/.gitkeep b/analysis/results/tables/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/environment.yml b/environment.yml
index a39d0d9..33ce2ab 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,6 +4,7 @@ channels:
- defaults
- anaconda
dependencies:
+ - altair=5.3
- tectonic=0.15.0
- pandoc=3.2
- poetry=1.8.3