included altair in environment, updated preprocessing ipynb
John Shiu committed Jun 22, 2024
1 parent 253f2b5 commit 7c9d169
Showing 8 changed files with 1,150 additions and 24 deletions.
48 changes: 24 additions & 24 deletions analysis/batch_run.yml
@@ -1,28 +1,28 @@
-runs: 28
+runs: 30
 checklist_path: null
-model: 'gpt-4-turbo'
-repo_base_path: '../data/raw/openja/'
-response_path: '../data/processed/batch_run_4-turbo'
+model: 'gpt-3.5-turbo'
+repo_base_path: 'data/raw/openja/'
+response_path: 'data/processed/batch_run_3.5-turbo'
 repo:
   - name: lightfm
     path: './lightfm'
-  # - name: qlib
-  #   path: './qlib'
-  # - name: mmf
-  #   path: './mmf'
-  # - name: nanodet
-  #   path: './nanodet'
-  # - name: magenta
-  #   path: './magenta'
-  # - name: nupic
-  #   path: './nupic'
-  # - name: DeepSpeech
-  #   path: './DeepSpeech'
-  # - name: apollo
-  #   path: './apollo'
-  # - name: 'paperless-ng'
-  #   path: './paperless-ng'
-  # - name: 'mycroft-core'
-  #   path: './mycroft-core'
-  # - name: deepchem
-  #   path: './deepchem'
+  - name: qlib
+    path: './qlib'
+  - name: mmf
+    path: './mmf'
+  - name: nanodet
+    path: './nanodet'
+  - name: magenta
+    path: './magenta'
+  - name: nupic
+    path: './nupic'
+  - name: DeepSpeech
+    path: './DeepSpeech'
+  - name: apollo
+    path: './apollo'
+  - name: 'paperless-ng'
+    path: './paperless-ng'
+  - name: 'mycroft-core'
+    path: './mycroft-core'
+  - name: deepchem
+    path: './deepchem'
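
For reference, a minimal sketch of how a batch-run config like the file above can be loaded and its repo paths resolved against repo_base_path. This assumes PyYAML; the helper name load_batch_run_config and the resolved_path field are illustrative, not part of the repository.

import os
import yaml

# Illustrative helper (not part of this commit): load the batch-run config
# and resolve each repo's relative path against repo_base_path.
def load_batch_run_config(path='analysis/batch_run.yml'):
    with open(path, 'r') as f:
        config = yaml.safe_load(f)
    base = config['repo_base_path']
    for repo in config['repo']:
        repo['resolved_path'] = os.path.normpath(os.path.join(base, repo['path']))
    return config

config = load_batch_run_config()
print(config['runs'], config['model'])            # 30 gpt-3.5-turbo
print([r['name'] for r in config['repo']][:3])    # ['lightfm', 'qlib', 'mmf']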
269 changes: 269 additions & 0 deletions analysis/ipynb/01_preprocess.ipynb
@@ -0,0 +1,269 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "18830011-62d1-4242-b851-e6e9ae47b49d",
"metadata": {},
"outputs": [],
"source": [
"#!pip install scipy altair"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "47bea46f-2b65-42ce-801e-55bf6576a67a",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import yaml\n",
"import pandas as pd\n",
"from collections import Counter\n",
"\n",
"id_item_map = {\n",
" '2.1': 'Ensure Data File Loads as Expected',\n",
" '3.2': 'Data in the Expected Format',\n",
" '3.5': 'Check for Duplicate Records in Data',\n",
" '4.2': 'Verify Data Split Proportion',\n",
" '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
" '6.1': 'Verify Evaluation Metrics Implementation',\n",
" '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
"}\n",
"\n",
"ground_truth = [\n",
" {'repo': 'lightfm', 'id': '2.1', 'score': 1},\n",
" {'repo': 'lightfm', 'id': '3.2', 'score': 1},\n",
" {'repo': 'lightfm', 'id': '3.5', 'score': 0},\n",
" {'repo': 'lightfm', 'id': '4.2', 'score': 1},\n",
" {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},\n",
" {'repo': 'lightfm', 'id': '6.1', 'score': 1},\n",
" {'repo': 'lightfm', 'id': '6.2', 'score': 1},\n",
" {'repo': 'qlib', 'id': '2.1', 'score': 0.5},\n",
" {'repo': 'qlib', 'id': '3.2', 'score': 1},\n",
" {'repo': 'qlib', 'id': '3.5', 'score': 0},\n",
" {'repo': 'qlib', 'id': '4.2', 'score': 0.5},\n",
" {'repo': 'qlib', 'id': '5.3', 'score': 1},\n",
" {'repo': 'qlib', 'id': '6.1', 'score': 1},\n",
" {'repo': 'qlib', 'id': '6.2', 'score': 1},\n",
" {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},\n",
" {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},\n",
" {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},\n",
" {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},\n",
" {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},\n",
" {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},\n",
" {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},\n",
"]\n",
"\n",
"def get_score_report_from_response(resp_path, verbose=False):\n",
" if verbose:\n",
" print(resp_path)\n",
" with open(resp_path, 'rb') as file:\n",
" response = json.load(file)\n",
" \n",
" reports = [] # report for each test file\n",
" for result in response['call_results']: # one test file per response\n",
" if result['parsed_response']:\n",
" resp = result['parsed_response']['results']\n",
" for item in resp:\n",
" item['file'] = result['files_evaluated'][0] \n",
" item['success'] = result['success']\n",
" reports.append(item)\n",
" # FIXME: not handled failed run for now\n",
" # else: # if the run is failed, the parsed_response will be None\n",
" # reports.append({\n",
" # 'ID': '2.1', \n",
" # 'Title': '',\n",
" # 'Requirement': '',\n",
" # 'Observation': '',\n",
" # 'Functions': [],\n",
" # 'Evaluation': '',\n",
" # 'Score': 0,\n",
" # 'file': result['files_evaluated'][0],\n",
" # 'success': result['success']\n",
" # })\n",
" \n",
" reports_df = pd.DataFrame(reports)\n",
" df = (\n",
" reports_df\n",
" .pivot(index='file', columns='ID', values='Score')\n",
" .rename_axis(None, axis=1)\n",
" )\n",
" df['success'] = reports_df.groupby(['file'])['success'].all()\n",
" df['response_path'] = os.path.abspath(resp_path)\n",
" \n",
" return df.reset_index()\n",
"\n",
"def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):\n",
" ''' Get score for each checklist item, by repository, by run and by test file\n",
" '''\n",
" with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:\n",
" config = pd.DataFrame(yaml.safe_load(file))\n",
"\n",
" config['response_path'] = config['response_path'].apply(\n",
" lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))\n",
" )\n",
" \n",
" tmp = [\n",
" get_score_report_from_response(\n",
" os.path.join(batch_run_dir_path, path),\n",
" verbose=verbose\n",
" ) for path in config['response_path']\n",
" ]\n",
" tmp = pd.concat(tmp, axis=0).reset_index(drop=True)\n",
" \n",
" return config.merge(tmp, on='response_path', how='left')\n",
"\n",
"def preprocess(df_repo_run_file, id_item_map=None):\n",
" if id_item_map is None:\n",
" id_item_map = {\n",
" '2.1': 'Ensure Data File Loads as Expected',\n",
" '3.2': 'Data in the Expected Format',\n",
" '3.5': 'Check for Duplicate Records in Data',\n",
" '4.2': 'Verify Data Split Proportion',\n",
" '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
" '6.1': 'Verify Evaluation Metrics Implementation',\n",
" '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
" }\n",
"\n",
" # prepare score data by repo, by run\n",
" df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({\n",
" id: ['max'] for id in id_item_map.keys()\n",
" })\n",
" df_repo_run.columns = [col[0] for col in df_repo_run.columns]\n",
" df_repo_run = df_repo_run.reset_index()\n",
" \n",
" # prepare statistics of scores by repo\n",
" df_repo__stat = df_repo_run.groupby(['repo']).agg({\n",
" id: ['mean', 'std', 'count'] for id in id_item_map.keys()\n",
" })\n",
" df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])\n",
" df_repo__stat.columns = ['repo', 'id', 'stat', 'value']\n",
" df_repo__stat = (\n",
" df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')\n",
" .reset_index()\n",
" .rename_axis(None, axis=1)\n",
" )\n",
" df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])\n",
" df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']\n",
" \n",
" # prepare counting of scores by repo\n",
" df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()\n",
" for id in list(id_item_map.keys())[1:]:\n",
" df_repo__count = df_repo__count.merge(\n",
" df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),\n",
" on=['repo', 'level_1'],\n",
" how='outer'\n",
" )\n",
" #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])\n",
" \n",
" df_repo__count = df_repo__count.fillna(0)\n",
"\n",
" df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')\n",
" df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])\n",
" df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']\n",
" \n",
" return (df_repo_run, df_repo__stat, df_repo__count)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "31c1ce0b-14e3-4825-aa6e-74dd4d4af960",
"metadata": {},
"outputs": [],
"source": [
"df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_3.5-turbo/')\n",
"df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
"\n",
"df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)\n",
"df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)\n",
"df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6343bba3-4fff-4c24-8e71-30ec81df4c4f",
"metadata": {},
"outputs": [],
"source": [
"df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4-turbo/')\n",
"df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
"\n",
"df_repo_run.to_csv('score_by_repo_run_4-turbo.csv', index=False)\n",
"df_repo__stat.to_csv('score_stat_by_repo_4-turbo.csv', index=False)\n",
"df_repo__count.to_csv('score_count_by_repo_4-turbo.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "11b75320-05a7-4db2-86ea-9c085df26d73",
"metadata": {},
"outputs": [],
"source": [
"df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4o/')\n",
"df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
"\n",
"df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)\n",
"df_repo__stat.to_csv('score_stat_by_repo_4o.csv', index=False)\n",
"df_repo__count.to_csv('score_count_by_repo_4o.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "455c8477-e02d-44cd-80dc-854b3e9e0fa5",
"metadata": {},
"outputs": [],
"source": [
"ground_truth_df = pd.DataFrame(ground_truth)\n",
"ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])\n",
"ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')\n",
"ground_truth_df.to_csv('ground_truth.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "28b5150d-d11b-41a5-91bb-32af1a19a776",
"metadata": {},
"outputs": [],
"source": [
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46ae0ecc-c510-418e-ae7b-6db3d6571219",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:fixml]",
"language": "python",
"name": "conda-env-fixml-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
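
For context, a minimal sketch of the response-file shape that get_score_report_from_response above consumes. This is reconstructed only from the keys the notebook reads ('call_results', 'files_evaluated', 'success', 'parsed_response', 'results') and the fields named in its commented-out fallback; real response files may carry additional fields, and all values below are placeholders.

import json

# Assumed shape, inferred from the notebook's parsing code; not actual
# evaluation output.
example_response = {
    "call_results": [
        {
            "files_evaluated": ["tests/test_data.py"],
            "success": True,
            "parsed_response": {
                "results": [
                    {
                        "ID": "2.1",
                        "Title": "Ensure Data File Loads as Expected",
                        "Requirement": "...",
                        "Observation": "...",
                        "Functions": [],
                        "Evaluation": "...",
                        "Score": 1,
                    },
                ]
            },
        }
    ]
}

with open("example_response.json", "w") as f:
    json.dump(example_response, f, indent=2)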