included altair in environment, updated preprocessing ipynb
John Shiu committed on Jun 22, 2024. 1 parent: 253f2b5; commit: 7c9d169.
Showing 8 changed files with 1,150 additions and 24 deletions.
@@ -1,28 +1,28 @@
-runs: 28
+runs: 30
 checklist_path: null
-model: 'gpt-4-turbo'
-repo_base_path: '../data/raw/openja/'
-response_path: '../data/processed/batch_run_4-turbo'
+model: 'gpt-3.5-turbo'
+repo_base_path: 'data/raw/openja/'
+response_path: 'data/processed/batch_run_3.5-turbo'
 repo:
   - name: lightfm
     path: './lightfm'
-  # - name: qlib
-  #   path: './qlib'
-  # - name: mmf
-  #   path: './mmf'
-  # - name: nanodet
-  #   path: './nanodet'
-  # - name: magenta
-  #   path: './magenta'
-  # - name: nupic
-  #   path: './nupic'
-  # - name: DeepSpeech
-  #   path: './DeepSpeech'
-  # - name: apollo
-  #   path: './apollo'
-  # - name: 'paperless-ng'
-  #   path: './paperless-ng'
-  # - name: 'mycroft-core'
-  #   path: './mycroft-core'
-  # - name: deepchem
-  #   path: './deepchem'
+  - name: qlib
+    path: './qlib'
+  - name: mmf
+    path: './mmf'
+  - name: nanodet
+    path: './nanodet'
+  - name: magenta
+    path: './magenta'
+  - name: nupic
+    path: './nupic'
+  - name: DeepSpeech
+    path: './DeepSpeech'
+  - name: apollo
+    path: './apollo'
+  - name: 'paperless-ng'
+    path: './paperless-ng'
+  - name: 'mycroft-core'
+    path: './mycroft-core'
+  - name: deepchem
+    path: './deepchem'
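A config of this shape can be read back with yaml.safe_load, which is also how the notebook below ingests its record_combine.yml. A minimal sketch of inspecting the updated config on its own; the file name batch_config.yml is a placeholder, since the actual path is not shown in this diff:

import yaml

# "batch_config.yml" is a placeholder; the real file path is not visible in this commit.
with open('batch_config.yml', 'r') as f:
    config = yaml.safe_load(f)

print(config['runs'])    # 30 runs per repository after this change
print(config['model'])   # 'gpt-3.5-turbo'
for repo in config['repo']:  # all 11 OpenJa repositories are now uncommented
    print(repo['name'], '->', repo['path'])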
@@ -0,0 +1,269 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "18830011-62d1-4242-b851-e6e9ae47b49d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install scipy altair"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "47bea46f-2b65-42ce-801e-55bf6576a67a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import yaml\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "\n",
    "id_item_map = {\n",
    "    '2.1': 'Ensure Data File Loads as Expected',\n",
    "    '3.2': 'Data in the Expected Format',\n",
    "    '3.5': 'Check for Duplicate Records in Data',\n",
    "    '4.2': 'Verify Data Split Proportion',\n",
    "    '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
    "    '6.1': 'Verify Evaluation Metrics Implementation',\n",
    "    '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
    "}\n",
    "\n",
    "ground_truth = [\n",
    "    {'repo': 'lightfm', 'id': '2.1', 'score': 1},\n",
    "    {'repo': 'lightfm', 'id': '3.2', 'score': 1},\n",
    "    {'repo': 'lightfm', 'id': '3.5', 'score': 0},\n",
    "    {'repo': 'lightfm', 'id': '4.2', 'score': 1},\n",
    "    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},\n",
    "    {'repo': 'lightfm', 'id': '6.1', 'score': 1},\n",
    "    {'repo': 'lightfm', 'id': '6.2', 'score': 1},\n",
    "    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},\n",
    "    {'repo': 'qlib', 'id': '3.2', 'score': 1},\n",
    "    {'repo': 'qlib', 'id': '3.5', 'score': 0},\n",
    "    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},\n",
    "    {'repo': 'qlib', 'id': '5.3', 'score': 1},\n",
    "    {'repo': 'qlib', 'id': '6.1', 'score': 1},\n",
    "    {'repo': 'qlib', 'id': '6.2', 'score': 1},\n",
    "    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},\n",
    "    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},\n",
    "]\n",
    "\n",
    "def get_score_report_from_response(resp_path, verbose=False):\n",
    "    if verbose:\n",
    "        print(resp_path)\n",
    "    with open(resp_path, 'rb') as file:\n",
    "        response = json.load(file)\n",
    "\n",
    "    reports = []  # report for each test file\n",
    "    for result in response['call_results']:  # one test file per response\n",
    "        if result['parsed_response']:\n",
    "            resp = result['parsed_response']['results']\n",
    "            for item in resp:\n",
    "                item['file'] = result['files_evaluated'][0]\n",
    "                item['success'] = result['success']\n",
    "                reports.append(item)\n",
    "        # FIXME: failed runs are not handled for now\n",
    "        # else:  # if the run failed, parsed_response will be None\n",
    "        #     reports.append({\n",
    "        #         'ID': '2.1',\n",
    "        #         'Title': '',\n",
    "        #         'Requirement': '',\n",
    "        #         'Observation': '',\n",
    "        #         'Functions': [],\n",
    "        #         'Evaluation': '',\n",
    "        #         'Score': 0,\n",
    "        #         'file': result['files_evaluated'][0],\n",
    "        #         'success': result['success']\n",
    "        #     })\n",
    "\n",
    "    reports_df = pd.DataFrame(reports)\n",
    "    df = (\n",
    "        reports_df\n",
    "        .pivot(index='file', columns='ID', values='Score')\n",
    "        .rename_axis(None, axis=1)\n",
    "    )\n",
    "    df['success'] = reports_df.groupby(['file'])['success'].all()\n",
    "    df['response_path'] = os.path.abspath(resp_path)\n",
    "\n",
    "    return df.reset_index()\n",
    "\n",
    "def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):\n",
    "    '''Get the score for each checklist item, by repository, by run and by test file.'''\n",
    "    with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:\n",
    "        config = pd.DataFrame(yaml.safe_load(file))\n",
    "\n",
    "    config['response_path'] = config['response_path'].apply(\n",
    "        lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))\n",
    "    )\n",
    "\n",
    "    tmp = [\n",
    "        get_score_report_from_response(\n",
    "            os.path.join(batch_run_dir_path, path),\n",
    "            verbose=verbose\n",
    "        ) for path in config['response_path']\n",
    "    ]\n",
    "    tmp = pd.concat(tmp, axis=0).reset_index(drop=True)\n",
    "\n",
    "    return config.merge(tmp, on='response_path', how='left')\n",
    "\n",
    "def preprocess(df_repo_run_file, id_item_map=None):\n",
    "    if id_item_map is None:\n",
    "        id_item_map = {\n",
    "            '2.1': 'Ensure Data File Loads as Expected',\n",
    "            '3.2': 'Data in the Expected Format',\n",
    "            '3.5': 'Check for Duplicate Records in Data',\n",
    "            '4.2': 'Verify Data Split Proportion',\n",
    "            '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
    "            '6.1': 'Verify Evaluation Metrics Implementation',\n",
    "            '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
    "        }\n",
    "\n",
    "    # prepare score data by repo, by run\n",
    "    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({\n",
    "        id: ['max'] for id in id_item_map.keys()\n",
    "    })\n",
    "    df_repo_run.columns = [col[0] for col in df_repo_run.columns]\n",
    "    df_repo_run = df_repo_run.reset_index()\n",
    "\n",
    "    # prepare statistics of scores by repo\n",
    "    df_repo__stat = df_repo_run.groupby(['repo']).agg({\n",
    "        id: ['mean', 'std', 'count'] for id in id_item_map.keys()\n",
    "    })\n",
    "    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])\n",
    "    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']\n",
    "    df_repo__stat = (\n",
    "        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')\n",
    "        .reset_index()\n",
    "        .rename_axis(None, axis=1)\n",
    "    )\n",
    "    df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])\n",
    "    df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']\n",
    "\n",
    "    # prepare counting of scores by repo\n",
    "    df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()\n",
    "    for id in list(id_item_map.keys())[1:]:\n",
    "        df_repo__count = df_repo__count.merge(\n",
    "            df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),\n",
    "            on=['repo', 'level_1'],\n",
    "            how='outer'\n",
    "        )\n",
    "    #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])\n",
    "\n",
    "    df_repo__count = df_repo__count.fillna(0)\n",
    "\n",
    "    df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')\n",
    "    df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])\n",
    "    df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']\n",
    "\n",
    "    return (df_repo_run, df_repo__stat, df_repo__count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "31c1ce0b-14e3-4825-aa6e-74dd4d4af960",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_3.5-turbo/')\n",
    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
    "\n",
    "df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)\n",
    "df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)\n",
    "df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6343bba3-4fff-4c24-8e71-30ec81df4c4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4-turbo/')\n",
    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
    "\n",
    "df_repo_run.to_csv('score_by_repo_run_4-turbo.csv', index=False)\n",
    "df_repo__stat.to_csv('score_stat_by_repo_4-turbo.csv', index=False)\n",
    "df_repo__count.to_csv('score_count_by_repo_4-turbo.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "11b75320-05a7-4db2-86ea-9c085df26d73",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4o/')\n",
    "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
    "\n",
    "df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)\n",
    "df_repo__stat.to_csv('score_stat_by_repo_4o.csv', index=False)\n",
    "df_repo__count.to_csv('score_count_by_repo_4o.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "455c8477-e02d-44cd-80dc-854b3e9e0fa5",
   "metadata": {},
   "outputs": [],
   "source": [
    "ground_truth_df = pd.DataFrame(ground_truth)\n",
    "ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])\n",
    "ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')\n",
    "ground_truth_df.to_csv('ground_truth.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28b5150d-d11b-41a5-91bb-32af1a19a776",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46ae0ecc-c510-418e-ae7b-6db3d6571219",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:fixml]",
   "language": "python",
   "name": "conda-env-fixml-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
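The commit message says altair was included in the environment, but the notebook above only installs it (#!pip install scipy altair) and never imports it. As one illustration of where it could fit, here is a minimal sketch that charts the per-repo statistics CSV written above; the chart design is an assumption, not code from this commit:

import altair as alt
import pandas as pd

# Reads the CSV produced by the notebook above ('repo', 'id', 'mean', 'std',
# 'count', 'title', 'id_title'); the chart itself is illustrative only.
df_stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')

chart = alt.Chart(df_stat).mark_bar().encode(
    x=alt.X('mean:Q', title='Mean score across runs'),
    y=alt.Y('repo:N', title='Repository'),
    row=alt.Row('id_title:N')   # one panel per checklist item
)
chart.save('score_stat_by_repo_3.5-turbo.html')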