diff --git a/analysis/batch_run.yml b/analysis/batch_run.yml index 64b803a..0214292 100644 --- a/analysis/batch_run.yml +++ b/analysis/batch_run.yml @@ -1,28 +1,28 @@ -runs: 28 +runs: 30 checklist_path: null -model: 'gpt-4-turbo' -repo_base_path: '../data/raw/openja/' -response_path: '../data/processed/batch_run_4-turbo' +model: 'gpt-3.5-turbo' +repo_base_path: 'data/raw/openja/' +response_path: 'data/processed/batch_run_3.5-turbo' repo: - name: lightfm path: './lightfm' - # - name: qlib - # path: './qlib' - # - name: mmf - # path: './mmf' - # - name: nanodet - # path: './nanodet' - # - name: magenta - # path: './magenta' - # - name: nupic - # path: './nupic' - # - name: DeepSpeech - # path: './DeepSpeech' - # - name: apollo - # path: './apollo' - # - name: 'paperless-ng' - # path: './paperless-ng' - # - name: 'mycroft-core' - # path: './mycroft-core' - # - name: deepchem - # path: './deepchem' + - name: qlib + path: './qlib' + - name: mmf + path: './mmf' + - name: nanodet + path: './nanodet' + - name: magenta + path: './magenta' + - name: nupic + path: './nupic' + - name: DeepSpeech + path: './DeepSpeech' + - name: apollo + path: './apollo' + - name: 'paperless-ng' + path: './paperless-ng' + - name: 'mycroft-core' + path: './mycroft-core' + - name: deepchem + path: './deepchem' diff --git a/analysis/ipynb/01_preprocess.ipynb b/analysis/ipynb/01_preprocess.ipynb new file mode 100644 index 0000000..c539db1 --- /dev/null +++ b/analysis/ipynb/01_preprocess.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "18830011-62d1-4242-b851-e6e9ae47b49d", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install scipy altair" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "47bea46f-2b65-42ce-801e-55bf6576a67a", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import yaml\n", + "import pandas as pd\n", + "from collections import Counter\n", + "\n", + "id_item_map = {\n", + " '2.1': 'Ensure Data File Loads as Expected',\n", + " '3.2': 'Data in the Expected Format',\n", + " '3.5': 'Check for Duplicate Records in Data',\n", + " '4.2': 'Verify Data Split Proportion',\n", + " '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n", + " '6.1': 'Verify Evaluation Metrics Implementation',\n", + " '6.2': \"Evaluate Model's Performance Against Thresholds\"\n", + "}\n", + "\n", + "ground_truth = [\n", + " {'repo': 'lightfm', 'id': '2.1', 'score': 1},\n", + " {'repo': 'lightfm', 'id': '3.2', 'score': 1},\n", + " {'repo': 'lightfm', 'id': '3.5', 'score': 0},\n", + " {'repo': 'lightfm', 'id': '4.2', 'score': 1},\n", + " {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},\n", + " {'repo': 'lightfm', 'id': '6.1', 'score': 1},\n", + " {'repo': 'lightfm', 'id': '6.2', 'score': 1},\n", + " {'repo': 'qlib', 'id': '2.1', 'score': 0.5},\n", + " {'repo': 'qlib', 'id': '3.2', 'score': 1},\n", + " {'repo': 'qlib', 'id': '3.5', 'score': 0},\n", + " {'repo': 'qlib', 'id': '4.2', 'score': 0.5},\n", + " {'repo': 'qlib', 'id': '5.3', 'score': 1},\n", + " {'repo': 'qlib', 'id': '6.1', 'score': 1},\n", + " {'repo': 'qlib', 'id': '6.2', 'score': 1},\n", + " {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},\n", + " {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},\n", + " {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},\n", + " {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},\n", + " {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},\n", + " {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},\n", + " {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},\n", + "]\n", + "\n", + "def get_score_report_from_response(resp_path, verbose=False):\n", + " if verbose:\n", + " print(resp_path)\n", + " with open(resp_path, 'rb') as file:\n", + " response = json.load(file)\n", + " \n", + " reports = [] # report for each test file\n", + " for result in response['call_results']: # one test file per response\n", + " if result['parsed_response']:\n", + " resp = result['parsed_response']['results']\n", + " for item in resp:\n", + " item['file'] = result['files_evaluated'][0] \n", + " item['success'] = result['success']\n", + " reports.append(item)\n", + " # FIXME: not handled failed run for now\n", + " # else: # if the run is failed, the parsed_response will be None\n", + " # reports.append({\n", + " # 'ID': '2.1', \n", + " # 'Title': '',\n", + " # 'Requirement': '',\n", + " # 'Observation': '',\n", + " # 'Functions': [],\n", + " # 'Evaluation': '',\n", + " # 'Score': 0,\n", + " # 'file': result['files_evaluated'][0],\n", + " # 'success': result['success']\n", + " # })\n", + " \n", + " reports_df = pd.DataFrame(reports)\n", + " df = (\n", + " reports_df\n", + " .pivot(index='file', columns='ID', values='Score')\n", + " .rename_axis(None, axis=1)\n", + " )\n", + " df['success'] = reports_df.groupby(['file'])['success'].all()\n", + " df['response_path'] = os.path.abspath(resp_path)\n", + " \n", + " return df.reset_index()\n", + "\n", + "def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):\n", + " ''' Get score for each checklist item, by repository, by run and by test file\n", + " '''\n", + " with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:\n", + " config = pd.DataFrame(yaml.safe_load(file))\n", + "\n", + " config['response_path'] = config['response_path'].apply(\n", + " lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))\n", + " )\n", + " \n", + " tmp = [\n", + " get_score_report_from_response(\n", + " os.path.join(batch_run_dir_path, path),\n", + " verbose=verbose\n", + " ) for path in config['response_path']\n", + " ]\n", + " tmp = pd.concat(tmp, axis=0).reset_index(drop=True)\n", + " \n", + " return config.merge(tmp, on='response_path', how='left')\n", + "\n", + "def preprocess(df_repo_run_file, id_item_map=None):\n", + " if id_item_map is None:\n", + " id_item_map = {\n", + " '2.1': 'Ensure Data File Loads as Expected',\n", + " '3.2': 'Data in the Expected Format',\n", + " '3.5': 'Check for Duplicate Records in Data',\n", + " '4.2': 'Verify Data Split Proportion',\n", + " '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n", + " '6.1': 'Verify Evaluation Metrics Implementation',\n", + " '6.2': \"Evaluate Model's Performance Against Thresholds\"\n", + " }\n", + "\n", + " # prepare score data by repo, by run\n", + " df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({\n", + " id: ['max'] for id in id_item_map.keys()\n", + " })\n", + " df_repo_run.columns = [col[0] for col in df_repo_run.columns]\n", + " df_repo_run = df_repo_run.reset_index()\n", + " \n", + " # prepare statistics of scores by repo\n", + " df_repo__stat = df_repo_run.groupby(['repo']).agg({\n", + " id: ['mean', 'std', 'count'] for id in id_item_map.keys()\n", + " })\n", + " df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])\n", + " df_repo__stat.columns = ['repo', 'id', 'stat', 'value']\n", + " df_repo__stat = (\n", + " df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')\n", + " .reset_index()\n", + " .rename_axis(None, axis=1)\n", + " )\n", + " df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])\n", + " df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']\n", + " \n", + " # prepare counting of scores by repo\n", + " df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()\n", + " for id in list(id_item_map.keys())[1:]:\n", + " df_repo__count = df_repo__count.merge(\n", + " df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),\n", + " on=['repo', 'level_1'],\n", + " how='outer'\n", + " )\n", + " #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])\n", + " \n", + " df_repo__count = df_repo__count.fillna(0)\n", + "\n", + " df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')\n", + " df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])\n", + " df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']\n", + " \n", + " return (df_repo_run, df_repo__stat, df_repo__count)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "31c1ce0b-14e3-4825-aa6e-74dd4d4af960", + "metadata": {}, + "outputs": [], + "source": [ + "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_3.5-turbo/')\n", + "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n", + "\n", + "df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)\n", + "df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)\n", + "df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6343bba3-4fff-4c24-8e71-30ec81df4c4f", + "metadata": {}, + "outputs": [], + "source": [ + "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4-turbo/')\n", + "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n", + "\n", + "df_repo_run.to_csv('score_by_repo_run_4-turbo.csv', index=False)\n", + "df_repo__stat.to_csv('score_stat_by_repo_4-turbo.csv', index=False)\n", + "df_repo__count.to_csv('score_count_by_repo_4-turbo.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "11b75320-05a7-4db2-86ea-9c085df26d73", + "metadata": {}, + "outputs": [], + "source": [ + "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4o/')\n", + "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n", + "\n", + "df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)\n", + "df_repo__stat.to_csv('score_stat_by_repo_4o.csv', index=False)\n", + "df_repo__count.to_csv('score_count_by_repo_4o.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "455c8477-e02d-44cd-80dc-854b3e9e0fa5", + "metadata": {}, + "outputs": [], + "source": [ + "ground_truth_df = pd.DataFrame(ground_truth)\n", + "ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])\n", + "ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')\n", + "ground_truth_df.to_csv('ground_truth.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28b5150d-d11b-41a5-91bb-32af1a19a776", + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46ae0ecc-c510-418e-ae7b-6db3d6571219", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:fixml]", + "language": "python", + "name": "conda-env-fixml-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/ipynb/02_plots-for-final-report.ipynb b/analysis/ipynb/02_plots-for-final-report.ipynb new file mode 100644 index 0000000..36e9feb --- /dev/null +++ b/analysis/ipynb/02_plots-for-final-report.ipynb @@ -0,0 +1,693 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e9a74646-ec18-49c0-b9ef-ed3b5ba64087", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import altair as alt\n", + "import pandas as pd\n", + "\n", + "df_repo__stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')\n", + "gt = pd.read_csv('ground_truth.csv')\n", + "gt = gt.melt(id_vars=['id', 'title'], var_name='repo', value_name='ground_truth')\n", + "\n", + "df_repo__stat_with_gt = df_repo__stat.merge(gt, on=['id', 'title', 'repo'])\n", + "\n", + "base = alt.Chart(\n", + " df_repo__stat_with_gt.query('repo in [\"lightfm\", \"qlib\", \"DeepSpeech\"]')\n", + ").transform_calculate(\n", + " min=\"max(0, datum.mean-datum.std)\",\n", + " max=\"min(1, datum.mean+datum.std)\"\n", + ")\n", + " \n", + "# generate the points\n", + "points = base.mark_point(\n", + " filled=True,\n", + " size=50,\n", + " color='black'\n", + ").encode(\n", + " x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title(\"Score\").axis(\n", + " labelExpr=\"datum.value % 0.5 ? null : datum.label\"\n", + " ),\n", + " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),\n", + ")\n", + "\n", + "# generate the points for ground truth\n", + "gt_points = base.mark_point(\n", + " filled=True,\n", + " size=200,\n", + " color='green',\n", + " shape=\"diamond\"\n", + ").encode(\n", + " x=alt.X('ground_truth:Q'),\n", + " y=alt.Y('id_title:N')\n", + ")\n", + "\n", + "# generate the error bars\n", + "errorbars = base.mark_errorbar().encode(\n", + " x=alt.X(\"min:Q\").title('1 SD'), #\"id:N\",\n", + " x2=\"max:Q\",\n", + " y=\"id_title:N\"\n", + ")\n", + "\n", + "(gt_points + points + errorbars).facet(\n", + " column=alt.Column('repo:N').title(None)\n", + ").configure_axis( \n", + " labelFontSize=12, \n", + " titleFontSize=12\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa1c3071-33f8-47da-a10d-42b7dbdcce1d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a169da71-4be7-4c88-8553-d6b68c2b1edf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score0.00.51.0
RepositoryChecklist ItemGround Truth
lightfm3.5. Check for Duplicate Records in Data0.03000
5.3. Ensure Model Output Shape Aligns with Expectation0.51290
2.1. Ensure Data File Loads as Expected1.00030
3.2. Data in the Expected Format1.00300
4.2. Verify Data Split Proportion1.001119
6.1. Verify Evaluation Metrics Implementation1.00525
6.2. Evaluate Model's Performance Against Thresholds1.00129
qlib3.5. Check for Duplicate Records in Data0.02370
2.1. Ensure Data File Loads as Expected0.50030
4.2. Verify Data Split Proportion0.53252
3.2. Data in the Expected Format1.001416
5.3. Ensure Model Output Shape Aligns with Expectation1.01254
6.1. Verify Evaluation Metrics Implementation1.021810
6.2. Evaluate Model's Performance Against Thresholds1.00246
\n", + "
" + ], + "text/plain": [ + "score 0.0 \\\n", + "Repository Checklist Item Ground Truth \n", + "lightfm 3.5. Check for Duplicate Records in Data 0.0 30 \n", + " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 1 \n", + " 2.1. Ensure Data File Loads as Expected 1.0 0 \n", + " 3.2. Data in the Expected Format 1.0 0 \n", + " 4.2. Verify Data Split Proportion 1.0 0 \n", + " 6.1. Verify Evaluation Metrics Implementation 1.0 0 \n", + " 6.2. Evaluate Model's Performance Against Thres... 1.0 0 \n", + "qlib 3.5. Check for Duplicate Records in Data 0.0 23 \n", + " 2.1. Ensure Data File Loads as Expected 0.5 0 \n", + " 4.2. Verify Data Split Proportion 0.5 3 \n", + " 3.2. Data in the Expected Format 1.0 0 \n", + " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 1 \n", + " 6.1. Verify Evaluation Metrics Implementation 1.0 2 \n", + " 6.2. Evaluate Model's Performance Against Thres... 1.0 0 \n", + "\n", + "score 0.5 \\\n", + "Repository Checklist Item Ground Truth \n", + "lightfm 3.5. Check for Duplicate Records in Data 0.0 0 \n", + " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 29 \n", + " 2.1. Ensure Data File Loads as Expected 1.0 0 \n", + " 3.2. Data in the Expected Format 1.0 30 \n", + " 4.2. Verify Data Split Proportion 1.0 11 \n", + " 6.1. Verify Evaluation Metrics Implementation 1.0 5 \n", + " 6.2. Evaluate Model's Performance Against Thres... 1.0 1 \n", + "qlib 3.5. Check for Duplicate Records in Data 0.0 7 \n", + " 2.1. Ensure Data File Loads as Expected 0.5 0 \n", + " 4.2. Verify Data Split Proportion 0.5 25 \n", + " 3.2. Data in the Expected Format 1.0 14 \n", + " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 25 \n", + " 6.1. Verify Evaluation Metrics Implementation 1.0 18 \n", + " 6.2. Evaluate Model's Performance Against Thres... 1.0 24 \n", + "\n", + "score 1.0 \n", + "Repository Checklist Item Ground Truth \n", + "lightfm 3.5. Check for Duplicate Records in Data 0.0 0 \n", + " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 0 \n", + " 2.1. Ensure Data File Loads as Expected 1.0 30 \n", + " 3.2. Data in the Expected Format 1.0 0 \n", + " 4.2. Verify Data Split Proportion 1.0 19 \n", + " 6.1. Verify Evaluation Metrics Implementation 1.0 25 \n", + " 6.2. Evaluate Model's Performance Against Thres... 1.0 29 \n", + "qlib 3.5. Check for Duplicate Records in Data 0.0 0 \n", + " 2.1. Ensure Data File Loads as Expected 0.5 30 \n", + " 4.2. Verify Data Split Proportion 0.5 2 \n", + " 3.2. Data in the Expected Format 1.0 16 \n", + " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 4 \n", + " 6.1. Verify Evaluation Metrics Implementation 1.0 10 \n", + " 6.2. Evaluate Model's Performance Against Thres... 1.0 6 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_repo_run = pd.read_csv('score_by_repo_run_3.5-turbo.csv')\n", + "\n", + "df_repo_run = df_repo_run.merge(gt, on=['id', 'title', 'repo'])\n", + "\n", + "contingency_table = pd.pivot_table(\n", + " df_repo_run,\n", + " values='run', \n", + " index=['repo', 'id_title', 'ground_truth'], \n", + " columns=['score'],\n", + " aggfunc='count', \n", + " fill_value=0\n", + ")\n", + "contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth']\n", + "contingency_table.sort_index(level=[0, 2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afc06ca7-5f39-4293-8bdb-9d46558e7535", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f23950d5-792f-4f0a-8e3a-1727b3598dd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stds = df_repo__stat[['repo', 'std', 'id_title']].pivot(index='repo', columns='id_title').copy()\n", + "stds.columns = [col[1] for col in stds.columns]\n", + "stds = stds.reset_index()\n", + "stds = stds.melt(id_vars='repo', var_name='id_title')\n", + "\n", + "base = alt.Chart(stds)\n", + "\n", + "box = base.mark_boxplot(\n", + " color='grey',\n", + " opacity=0.5,\n", + " size=20,\n", + ").encode(\n", + " x=alt.X('value:Q').title('Standard Deviation of Scores'),\n", + " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))\n", + ")\n", + "\n", + "stripplot = base.mark_circle(size=100).encode(\n", + " y=alt.Y( \n", + " 'id_title:N',\n", + " axis=alt.Axis(ticks=False, grid=True, labels=True), \n", + " scale=alt.Scale(), \n", + " ), \n", + " x='value:Q',\n", + " yOffset=\"jitter:Q\",\n", + " color=alt.Color('id_title:N', legend=None),\n", + " tooltip='repo'\n", + ").transform_calculate(\n", + " # Generate Gaussian jitter with a Box-Muller transform\n", + " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", + ")\n", + "\n", + "(\n", + " box + stripplot\n", + ").configure_view( \n", + " stroke=None\n", + ").configure_axis( \n", + " labelFontSize=12, \n", + " titleFontSize=12\n", + ").properties(\n", + " height=300, \n", + " width=600,\n", + " title=\"30 Runs on Openja's Repositories for each Checklist Item\"\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d493dc0-5b75-4348-a627-b1194e498b0d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bade3842-185e-4369-a5d7-4356290df058", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_repo_4o__stat = pd.read_csv('score_stat_by_repo_4o.csv')\n", + "df_repo_4o__stat_with_gt = df_repo_4o__stat.merge(gt, on=['id', 'title', 'repo'])\n", + "df_repo_4o__stat_with_gt['model'] = 'gpt-4o'\n", + "\n", + "df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query(\"repo == 'lightfm'\").copy()\n", + "df_repo_35turbo__stat_with_gt['model'] = 'gpt-3.5-turbo'\n", + "\n", + "df_model_comp = pd.concat(\n", + " (df_repo_35turbo__stat_with_gt, df_repo_4o__stat_with_gt), \n", + " axis=0\n", + ")\n", + "\n", + "base = alt.Chart(\n", + " df_model_comp\n", + ").transform_calculate(\n", + " min=\"max(0, datum.mean-datum.std)\",\n", + " max=\"min(1, datum.mean+datum.std)\"\n", + ")\n", + " \n", + "# generate the points\n", + "points = base.mark_point(\n", + " filled=True,\n", + " size=50,\n", + " color='black'\n", + ").encode(\n", + " x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title(\"Score\").axis(\n", + " labelExpr=\"datum.value % 0.5 ? null : datum.label\"\n", + " ),\n", + " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),\n", + ")\n", + "\n", + "# generate the points for ground truth\n", + "gt_points = base.mark_point(\n", + " filled=True,\n", + " size=200,\n", + " color='green',\n", + " shape=\"diamond\"\n", + ").encode(\n", + " x=alt.X('ground_truth:Q'),\n", + " y=alt.Y('id_title:N')\n", + ")\n", + "\n", + "# generate the error bars\n", + "errorbars = base.mark_errorbar().encode(\n", + " x=alt.X(\"min:Q\").title('1 SD'), #\"id:N\",\n", + " x2=\"max:Q\",\n", + " y=\"id_title:N\"\n", + ")\n", + "\n", + "(gt_points + points + errorbars).facet(\n", + " column=alt.Column('model:N').title(None)\n", + ").configure_axis( \n", + " labelFontSize=12, \n", + " titleFontSize=12\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1af0fef8-2c34-4166-affe-93224c639cf9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:fixml]", + "language": "python", + "name": "conda-env-fixml-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/preprocess_batch_run_result.py b/analysis/preprocess_batch_run_result.py new file mode 100644 index 0000000..d792633 --- /dev/null +++ b/analysis/preprocess_batch_run_result.py @@ -0,0 +1,163 @@ +import os +import json +import yaml +import pandas as pd +from collections import Counter + +id_item_map = { + '2.1': 'Ensure Data File Loads as Expected', + '3.2': 'Data in the Expected Format', + '3.5': 'Check for Duplicate Records in Data', + '4.2': 'Verify Data Split Proportion', + '5.3': 'Ensure Model Output Shape Aligns with Expectation', + '6.1': 'Verify Evaluation Metrics Implementation', + '6.2': "Evaluate Model's Performance Against Thresholds" +} + +ground_truth = [ + {'repo': 'lightfm', 'id': '2.1', 'score': 1}, + {'repo': 'lightfm', 'id': '3.2', 'score': 1}, + {'repo': 'lightfm', 'id': '3.5', 'score': 0}, + {'repo': 'lightfm', 'id': '4.2', 'score': 1}, + {'repo': 'lightfm', 'id': '5.3', 'score': 0.5}, + {'repo': 'lightfm', 'id': '6.1', 'score': 1}, + {'repo': 'lightfm', 'id': '6.2', 'score': 1}, + {'repo': 'qlib', 'id': '2.1', 'score': 0.5}, + {'repo': 'qlib', 'id': '3.2', 'score': 1}, + {'repo': 'qlib', 'id': '3.5', 'score': 0}, + {'repo': 'qlib', 'id': '4.2', 'score': 0.5}, + {'repo': 'qlib', 'id': '5.3', 'score': 1}, + {'repo': 'qlib', 'id': '6.1', 'score': 1}, + {'repo': 'qlib', 'id': '6.2', 'score': 1}, + {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0}, + {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0}, + {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0}, + {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0}, + {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0}, + {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0}, + {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0}, +] + +def get_score_report_from_response(resp_path, verbose=False): + if verbose: + print(resp_path) + with open(resp_path, 'rb') as file: + response = json.load(file) + + reports = [] # report for each test file + for result in response['call_results']: # one test file per response + if result['parsed_response']: + resp = result['parsed_response']['results'] + for item in resp: + item['file'] = result['files_evaluated'][0] + item['success'] = result['success'] + reports.append(item) + # FIXME: not handled failed run for now + # else: # if the run is failed, the parsed_response will be None + # reports.append({ + # 'ID': '2.1', + # 'Title': '', + # 'Requirement': '', + # 'Observation': '', + # 'Functions': [], + # 'Evaluation': '', + # 'Score': 0, + # 'file': result['files_evaluated'][0], + # 'success': result['success'] + # }) + + reports_df = pd.DataFrame(reports) + df = ( + reports_df + .pivot(index='file', columns='ID', values='Score') + .rename_axis(None, axis=1) + ) + df['success'] = reports_df.groupby(['file'])['success'].all() + df['response_path'] = os.path.abspath(resp_path) + + return df.reset_index() + +def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False): + ''' Get score for each checklist item, by repository, by run and by test file + ''' + with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file: + config = pd.DataFrame(yaml.safe_load(file)) + + config['response_path'] = config['response_path'].apply( + lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x)) + ) + + tmp = [ + get_score_report_from_response( + os.path.join(batch_run_dir_path, path), + verbose=verbose + ) for path in config['response_path'] + ] + tmp = pd.concat(tmp, axis=0).reset_index(drop=True) + + return config.merge(tmp, on='response_path', how='left') + +def preprocess(df_repo_run_file, id_item_map=None): + if id_item_map is None: + id_item_map = { + '2.1': 'Ensure Data File Loads as Expected', + '3.2': 'Data in the Expected Format', + '3.5': 'Check for Duplicate Records in Data', + '4.2': 'Verify Data Split Proportion', + '5.3': 'Ensure Model Output Shape Aligns with Expectation', + '6.1': 'Verify Evaluation Metrics Implementation', + '6.2': "Evaluate Model's Performance Against Thresholds" + } + + # prepare score data by repo, by run + df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({ + id: ['max'] for id in id_item_map.keys() + }) + df_repo_run.columns = [col[0] for col in df_repo_run.columns] + df_repo_run = df_repo_run.reset_index() + + # prepare statistics of scores by repo + df_repo__stat = df_repo_run.groupby(['repo']).agg({ + id: ['mean', 'std', 'count'] for id in id_item_map.keys() + }) + df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')]) + df_repo__stat.columns = ['repo', 'id', 'stat', 'value'] + df_repo__stat = ( + df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value') + .reset_index() + .rename_axis(None, axis=1) + ) + df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x]) + df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title'] + + # prepare counting of scores by repo + df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index() + for id in list(id_item_map.keys())[1:]: + df_repo__count = df_repo__count.merge( + df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(), + on=['repo', 'level_1'], + how='outer' + ) + #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x]) + + df_repo__count = df_repo__count.fillna(0) + + df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score') + df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x]) + df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title'] + + return (df_repo_run, df_repo__stat, df_repo__count) + + +for model in ['3.5-turbo', '4-turbo', '4o']: + df_repo_run_file = get_scores_by_repo_by_run_by_file(f'data/batch_run/batch_run_{model}/') + df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file) + + df_repo_run.to_csv(f'data/processed/score_by_repo_run_{model}.csv', index=False) + df_repo__stat.to_csv(f'data/processed/score_stat_by_repo_{model}.csv', index=False) + df_repo__count.to_csv(f'data/processed/score_count_by_repo_{model}.csv', index=False) + +ground_truth_df = pd.DataFrame(ground_truth) +ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x]) +ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score') +ground_truth_df.to_csv('data/processed/ground_truth.csv') \ No newline at end of file diff --git a/analysis/results/figures/.gitkeep b/analysis/results/figures/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/analysis/results/output/.gitkeep b/analysis/results/output/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/analysis/results/tables/.gitkeep b/analysis/results/tables/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/environment.yml b/environment.yml index a39d0d9..33ce2ab 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,7 @@ channels: - defaults - anaconda dependencies: + - altair=5.3 - tectonic=0.15.0 - pandoc=3.2 - poetry=1.8.3