included altair in environment, updated preprocessing ipynb
John Shiu committed on Jun 22, 2024. 1 parent: 253f2b5; commit: 7c9d169.
Showing 8 changed files with 1,150 additions and 24 deletions.
@@ -1,28 +1,28 @@
-runs: 28
+runs: 30
 checklist_path: null
-model: 'gpt-4-turbo'
-repo_base_path: '../data/raw/openja/'
-response_path: '../data/processed/batch_run_4-turbo'
+model: 'gpt-3.5-turbo'
+repo_base_path: 'data/raw/openja/'
+response_path: 'data/processed/batch_run_3.5-turbo'
 repo:
   - name: lightfm
     path: './lightfm'
-  # - name: qlib
-  #   path: './qlib'
-  # - name: mmf
-  #   path: './mmf'
-  # - name: nanodet
-  #   path: './nanodet'
-  # - name: magenta
-  #   path: './magenta'
-  # - name: nupic
-  #   path: './nupic'
-  # - name: DeepSpeech
-  #   path: './DeepSpeech'
-  # - name: apollo
-  #   path: './apollo'
-  # - name: 'paperless-ng'
-  #   path: './paperless-ng'
-  # - name: 'mycroft-core'
-  #   path: './mycroft-core'
-  # - name: deepchem
-  #   path: './deepchem'
+  - name: qlib
+    path: './qlib'
+  - name: mmf
+    path: './mmf'
+  - name: nanodet
+    path: './nanodet'
+  - name: magenta
+    path: './magenta'
+  - name: nupic
+    path: './nupic'
+  - name: DeepSpeech
+    path: './DeepSpeech'
+  - name: apollo
+    path: './apollo'
+  - name: 'paperless-ng'
+    path: './paperless-ng'
+  - name: 'mycroft-core'
+    path: './mycroft-core'
+  - name: deepchem
+    path: './deepchem'
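A config of this shape can be read back with yaml.safe_load, which is also how the notebook below ingests its record_combine.yml. A minimal sketch of inspecting the updated config on its own; the file name batch_config.yml is a placeholder, since the actual path is not shown in this diff:

import yaml

# "batch_config.yml" is a placeholder; the real file path is not visible in this commit.
with open('batch_config.yml', 'r') as f:
    config = yaml.safe_load(f)

print(config['runs'])    # 30 runs per repository after this change
print(config['model'])   # 'gpt-3.5-turbo'
for repo in config['repo']:  # all 11 OpenJa repositories are now uncommented
    print(repo['name'], '->', repo['path'])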
@@ -0,0 +1,269 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "18830011-62d1-4242-b851-e6e9ae47b49d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install scipy altair"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "47bea46f-2b65-42ce-801e-55bf6576a67a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import yaml\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "\n",
    "id_item_map = {\n",
    "    '2.1': 'Ensure Data File Loads as Expected',\n",
    "    '3.2': 'Data in the Expected Format',\n",
    "    '3.5': 'Check for Duplicate Records in Data',\n",
    "    '4.2': 'Verify Data Split Proportion',\n",
    "    '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
    "    '6.1': 'Verify Evaluation Metrics Implementation',\n",
    "    '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
    "}\n",
    "\n",
    "ground_truth = [\n",
    "    {'repo': 'lightfm', 'id': '2.1', 'score': 1},\n",
    "    {'repo': 'lightfm', 'id': '3.2', 'score': 1},\n",
    "    {'repo': 'lightfm', 'id': '3.5', 'score': 0},\n",
    "    {'repo': 'lightfm', 'id': '4.2', 'score': 1},\n",
    "    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},\n",
    "    {'repo': 'lightfm', 'id': '6.1', 'score': 1},\n",
    "    {'repo': 'lightfm', 'id': '6.2', 'score': 1},\n",
    "    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},\n",
    "    {'repo': 'qlib', 'id': '3.2', 'score': 1},\n",
    "    {'repo': 'qlib', 'id': '3.5', 'score': 0},\n",
    "    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},\n",
    "    {'repo': 'qlib', 'id': '5.3', 'score': 1},\n",
    "    {'repo': 'qlib', 'id': '6.1', 'score': 1},\n",
    "    {'repo': 'qlib', 'id': '6.2', 'score': 1},\n",
    "    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},\n",
    "]\n",
    "\n",
    "def get_score_report_from_response(resp_path, verbose=False):\n",
    "    if verbose:\n",
    "        print(resp_path)\n",
    "    with open(resp_path, 'rb') as file:\n",
    "        response = json.load(file)\n",
    "\n",
    "    reports = []  # report for each test file\n",
    "    for result in response['call_results']:  # one test file per response\n",
    "        if result['parsed_response']:\n",
    "            resp = result['parsed_response']['results']\n",
    "            for item in resp:\n",
    "                item['file'] = result['files_evaluated'][0]\n",
    "                item['success'] = result['success']\n",
    "                reports.append(item)\n",
    "        # FIXME: failed runs are not handled for now\n",
    "        # else:  # if the run failed, parsed_response will be None\n",
    "        #     reports.append({\n",
    "        #         'ID': '2.1',\n",
    "        #         'Title': '',\n",
    "        #         'Requirement': '',\n",
    "        #         'Observation': '',\n",
    "        #         'Functions': [],\n",
    "        #         'Evaluation': '',\n",
    "        #         'Score': 0,\n",
    "        #         'file': result['files_evaluated'][0],\n",
    "        #         'success': result['success']\n",
    "        #     })\n",
    "\n",
    "    reports_df = pd.DataFrame(reports)\n",
    "    df = (\n",
    "        reports_df\n",
    "        .pivot(index='file', columns='ID', values='Score')\n",
    "        .rename_axis(None, axis=1)\n",
    "    )\n",
    "    df['success'] = reports_df.groupby(['file'])['success'].all()\n",
    "    df['response_path'] = os.path.abspath(resp_path)\n",
    "\n",
    "    return df.reset_index()\n",
    "\n",
    "def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):\n",
    "    '''Get the score for each checklist item, by repository, by run and by test file.'''\n",
    "    with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:\n",
    "        config = pd.DataFrame(yaml.safe_load(file))\n",
    "\n",
    "    config['response_path'] = config['response_path'].apply(\n",
    "        lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))\n",
    "    )\n",
    "\n",
    "    tmp = [\n",
    "        get_score_report_from_response(\n",
    "            os.path.join(batch_run_dir_path, path),\n",
    "            verbose=verbose\n",
    "        ) for path in config['response_path']\n",
    "    ]\n",
    "    tmp = pd.concat(tmp, axis=0).reset_index(drop=True)\n",
    "\n",
    "    return config.merge(tmp, on='response_path', how='left')\n",
    "\n",
    "def preprocess(df_repo_run_file, id_item_map=None):\n",
    "    if id_item_map is None:\n",
    "        id_item_map = {\n",
    "            '2.1': 'Ensure Data File Loads as Expected',\n",
    "            '3.2': 'Data in the Expected Format',\n",
    "            '3.5': 'Check for Duplicate Records in Data',\n",
    "            '4.2': 'Verify Data Split Proportion',\n",
    "            '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
    "            '6.1': 'Verify Evaluation Metrics Implementation',\n",
    "            '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
    "        }\n",
    "\n",
    "    # prepare score data by repo, by run\n",
    "    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({\n",
    "        id: ['max'] for id in id_item_map.keys()\n",
    "    })\n",
    "    df_repo_run.columns = [col[0] for col in df_repo_run.columns]\n",
    "    df_repo_run = df_repo_run.reset_index()\n",
    "\n",
    "    # prepare statistics of scores by repo\n",
    "    df_repo__stat = df_repo_run.groupby(['repo']).agg({\n",
    "        id: ['mean', 'std', 'count'] for id in id_item_map.keys()\n",
    "    })\n",
    "    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])\n",
    "    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']\n",
    "    df_repo__stat = (\n",
    "        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')\n",
    "        .reset_index()\n",
    "        .rename_axis(None, axis=1)\n",
    "    )\n",
    "    df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])\n",
    "    df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']\n",
    "\n",
    "    # prepare counting of scores by repo\n",
    "    df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()\n",
    "    for id in list(id_item_map.keys())[1:]:\n",
    "        df_repo__count = df_repo__count.merge(\n",
    "            df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),\n",
    "            on=['repo', 'level_1'],\n",
    "            how='outer'\n",
    "        )\n",
    "    #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])\n",
    "\n",
    "    df_repo__count = df_repo__count.fillna(0)\n",
    "\n",
    "    df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')\n",
    "    df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])\n",
    "    df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']\n",
    "\n",
    "    return (df_repo_run, df_repo__stat, df_repo__count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "31c1ce0b-14e3-4825-aa6e-74dd4d4af960",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_3.5-turbo/')\n",
    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
    "\n",
    "df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)\n",
    "df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)\n",
    "df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6343bba3-4fff-4c24-8e71-30ec81df4c4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4-turbo/')\n",
    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
    "\n",
    "df_repo_run.to_csv('score_by_repo_run_4-turbo.csv', index=False)\n",
    "df_repo__stat.to_csv('score_stat_by_repo_4-turbo.csv', index=False)\n",
    "df_repo__count.to_csv('score_count_by_repo_4-turbo.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "11b75320-05a7-4db2-86ea-9c085df26d73",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4o/')\n",
    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
    "\n",
    "df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)\n",
    "df_repo__stat.to_csv('score_stat_by_repo_4o.csv', index=False)\n",
    "df_repo__count.to_csv('score_count_by_repo_4o.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "455c8477-e02d-44cd-80dc-854b3e9e0fa5",
   "metadata": {},
   "outputs": [],
   "source": [
    "ground_truth_df = pd.DataFrame(ground_truth)\n",
    "ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])\n",
    "ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')\n",
    "ground_truth_df.to_csv('ground_truth.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28b5150d-d11b-41a5-91bb-32af1a19a776",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46ae0ecc-c510-418e-ae7b-6db3d6571219",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:fixml]",
   "language": "python",
   "name": "conda-env-fixml-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
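The commit message says altair was included in the environment, but the notebook above only installs it (#!pip install scipy altair) and never imports it. As one illustration of where it could fit, here is a minimal sketch that charts the per-repo statistics CSV written above; the chart design is an assumption, not code from this commit:

import altair as alt
import pandas as pd

# Reads the CSV produced by the notebook above ('repo', 'id', 'mean', 'std',
# 'count', 'title', 'id_title'); the chart itself is illustrative only.
df_stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')

chart = alt.Chart(df_stat).mark_bar().encode(
    x=alt.X('mean:Q', title='Mean score across runs'),
    y=alt.Y('repo:N', title='Repository'),
    row=alt.Row('id_title:N')   # one panel per checklist item
)
chart.save('score_stat_by_repo_3.5-turbo.html')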