diff --git a/analysis/batch_run.yml b/analysis/batch_run.yml
index 64b803a..0214292 100644
--- a/analysis/batch_run.yml
+++ b/analysis/batch_run.yml
@@ -1,28 +1,28 @@
-runs: 28
+runs: 30
checklist_path: null
-model: 'gpt-4-turbo'
-repo_base_path: '../data/raw/openja/'
-response_path: '../data/processed/batch_run_4-turbo'
+model: 'gpt-3.5-turbo'
+repo_base_path: 'data/raw/openja/'
+response_path: 'data/processed/batch_run_3.5-turbo'
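+# Repositories to evaluate; each path below is presumably resolved relative to repo_base_path.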
repo:
- name: lightfm
path: './lightfm'
- # - name: qlib
- # path: './qlib'
- # - name: mmf
- # path: './mmf'
- # - name: nanodet
- # path: './nanodet'
- # - name: magenta
- # path: './magenta'
- # - name: nupic
- # path: './nupic'
- # - name: DeepSpeech
- # path: './DeepSpeech'
- # - name: apollo
- # path: './apollo'
- # - name: 'paperless-ng'
- # path: './paperless-ng'
- # - name: 'mycroft-core'
- # path: './mycroft-core'
- # - name: deepchem
- # path: './deepchem'
+ - name: qlib
+ path: './qlib'
+ - name: mmf
+ path: './mmf'
+ - name: nanodet
+ path: './nanodet'
+ - name: magenta
+ path: './magenta'
+ - name: nupic
+ path: './nupic'
+ - name: DeepSpeech
+ path: './DeepSpeech'
+ - name: apollo
+ path: './apollo'
+ - name: 'paperless-ng'
+ path: './paperless-ng'
+ - name: 'mycroft-core'
+ path: './mycroft-core'
+ - name: deepchem
+ path: './deepchem'
diff --git a/analysis/ipynb/01_preprocess.ipynb b/analysis/ipynb/01_preprocess.ipynb
new file mode 100644
index 0000000..c539db1
--- /dev/null
+++ b/analysis/ipynb/01_preprocess.ipynb
@@ -0,0 +1,269 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "18830011-62d1-4242-b851-e6e9ae47b49d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install scipy altair"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "47bea46f-2b65-42ce-801e-55bf6576a67a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import json\n",
+ "import yaml\n",
+ "import pandas as pd\n",
+ "from collections import Counter\n",
+ "\n",
+ "id_item_map = {\n",
+ " '2.1': 'Ensure Data File Loads as Expected',\n",
+ " '3.2': 'Data in the Expected Format',\n",
+ " '3.5': 'Check for Duplicate Records in Data',\n",
+ " '4.2': 'Verify Data Split Proportion',\n",
+ " '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
+ " '6.1': 'Verify Evaluation Metrics Implementation',\n",
+ " '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
+ "}\n",
+ "\n",
+ "ground_truth = [\n",
+ " {'repo': 'lightfm', 'id': '2.1', 'score': 1},\n",
+ " {'repo': 'lightfm', 'id': '3.2', 'score': 1},\n",
+ " {'repo': 'lightfm', 'id': '3.5', 'score': 0},\n",
+ " {'repo': 'lightfm', 'id': '4.2', 'score': 1},\n",
+ " {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},\n",
+ " {'repo': 'lightfm', 'id': '6.1', 'score': 1},\n",
+ " {'repo': 'lightfm', 'id': '6.2', 'score': 1},\n",
+ " {'repo': 'qlib', 'id': '2.1', 'score': 0.5},\n",
+ " {'repo': 'qlib', 'id': '3.2', 'score': 1},\n",
+ " {'repo': 'qlib', 'id': '3.5', 'score': 0},\n",
+ " {'repo': 'qlib', 'id': '4.2', 'score': 0.5},\n",
+ " {'repo': 'qlib', 'id': '5.3', 'score': 1},\n",
+ " {'repo': 'qlib', 'id': '6.1', 'score': 1},\n",
+ " {'repo': 'qlib', 'id': '6.2', 'score': 1},\n",
+ " {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},\n",
+ " {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},\n",
+ "]\n",
+ "\n",
+ "def get_score_report_from_response(resp_path, verbose=False):\n",
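+    "    '''Parse a single batch-run response JSON into a per-test-file score table.\n",
+    "\n",
+    "    Returns a DataFrame with one row per evaluated test file, one column per\n",
+    "    checklist item ID, plus `success` and `response_path` columns.\n",
+    "    '''\n",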
+ " if verbose:\n",
+ " print(resp_path)\n",
+ " with open(resp_path, 'rb') as file:\n",
+ " response = json.load(file)\n",
+ " \n",
+ " reports = [] # report for each test file\n",
+    "    for result in response['call_results']: # each call result covers one test file\n",
+ " if result['parsed_response']:\n",
+ " resp = result['parsed_response']['results']\n",
+ " for item in resp:\n",
+ " item['file'] = result['files_evaluated'][0] \n",
+ " item['success'] = result['success']\n",
+ " reports.append(item)\n",
+    "        # FIXME: failed runs are not handled yet\n",
+ " # else: # if the run is failed, the parsed_response will be None\n",
+ " # reports.append({\n",
+ " # 'ID': '2.1', \n",
+ " # 'Title': '',\n",
+ " # 'Requirement': '',\n",
+ " # 'Observation': '',\n",
+ " # 'Functions': [],\n",
+ " # 'Evaluation': '',\n",
+ " # 'Score': 0,\n",
+ " # 'file': result['files_evaluated'][0],\n",
+ " # 'success': result['success']\n",
+ " # })\n",
+ " \n",
+ " reports_df = pd.DataFrame(reports)\n",
+ " df = (\n",
+ " reports_df\n",
+ " .pivot(index='file', columns='ID', values='Score')\n",
+ " .rename_axis(None, axis=1)\n",
+ " )\n",
+ " df['success'] = reports_df.groupby(['file'])['success'].all()\n",
+ " df['response_path'] = os.path.abspath(resp_path)\n",
+ " \n",
+ " return df.reset_index()\n",
+ "\n",
+ "def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):\n",
+ " ''' Get score for each checklist item, by repository, by run and by test file\n",
+ " '''\n",
+ " with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:\n",
+ " config = pd.DataFrame(yaml.safe_load(file))\n",
+ "\n",
+ " config['response_path'] = config['response_path'].apply(\n",
+ " lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))\n",
+ " )\n",
+ " \n",
+ " tmp = [\n",
+ " get_score_report_from_response(\n",
+ " os.path.join(batch_run_dir_path, path),\n",
+ " verbose=verbose\n",
+ " ) for path in config['response_path']\n",
+ " ]\n",
+ " tmp = pd.concat(tmp, axis=0).reset_index(drop=True)\n",
+ " \n",
+ " return config.merge(tmp, on='response_path', how='left')\n",
+ "\n",
+ "def preprocess(df_repo_run_file, id_item_map=None):\n",
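+    "    '''Aggregate per-file scores to per-run scores (max across test files), then\n",
+    "    derive per-repo statistics (mean/std/count) and per-score counts.\n",
+    "\n",
+    "    Returns (df_repo_run, df_repo__stat, df_repo__count).\n",
+    "    '''\n",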
+ " if id_item_map is None:\n",
+ " id_item_map = {\n",
+ " '2.1': 'Ensure Data File Loads as Expected',\n",
+ " '3.2': 'Data in the Expected Format',\n",
+ " '3.5': 'Check for Duplicate Records in Data',\n",
+ " '4.2': 'Verify Data Split Proportion',\n",
+ " '5.3': 'Ensure Model Output Shape Aligns with Expectation',\n",
+ " '6.1': 'Verify Evaluation Metrics Implementation',\n",
+ " '6.2': \"Evaluate Model's Performance Against Thresholds\"\n",
+ " }\n",
+ "\n",
+ " # prepare score data by repo, by run\n",
+ " df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({\n",
+ " id: ['max'] for id in id_item_map.keys()\n",
+ " })\n",
+ " df_repo_run.columns = [col[0] for col in df_repo_run.columns]\n",
+ " df_repo_run = df_repo_run.reset_index()\n",
+ " \n",
+ " # prepare statistics of scores by repo\n",
+ " df_repo__stat = df_repo_run.groupby(['repo']).agg({\n",
+ " id: ['mean', 'std', 'count'] for id in id_item_map.keys()\n",
+ " })\n",
+ " df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])\n",
+ " df_repo__stat.columns = ['repo', 'id', 'stat', 'value']\n",
+ " df_repo__stat = (\n",
+ " df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')\n",
+ " .reset_index()\n",
+ " .rename_axis(None, axis=1)\n",
+ " )\n",
+ " df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])\n",
+ " df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']\n",
+ " \n",
+ " # prepare counting of scores by repo\n",
+ " df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()\n",
+ " for id in list(id_item_map.keys())[1:]:\n",
+ " df_repo__count = df_repo__count.merge(\n",
+ " df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),\n",
+ " on=['repo', 'level_1'],\n",
+ " how='outer'\n",
+ " )\n",
+ " #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])\n",
+ " \n",
+ " df_repo__count = df_repo__count.fillna(0)\n",
+ "\n",
+ " df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')\n",
+ " df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])\n",
+ " df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']\n",
+ " \n",
+ " return (df_repo_run, df_repo__stat, df_repo__count)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "31c1ce0b-14e3-4825-aa6e-74dd4d4af960",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_3.5-turbo/')\n",
+ "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
+ "\n",
+ "df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)\n",
+ "df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)\n",
+ "df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6343bba3-4fff-4c24-8e71-30ec81df4c4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4-turbo/')\n",
+ "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
+ "\n",
+ "df_repo_run.to_csv('score_by_repo_run_4-turbo.csv', index=False)\n",
+ "df_repo__stat.to_csv('score_stat_by_repo_4-turbo.csv', index=False)\n",
+ "df_repo__count.to_csv('score_count_by_repo_4-turbo.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "11b75320-05a7-4db2-86ea-9c085df26d73",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/batch_run/batch_run_4o/')\n",
+ "df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)\n",
+ "\n",
+ "df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)\n",
+ "df_repo__stat.to_csv('score_stat_by_repo_4o.csv', index=False)\n",
+ "df_repo__count.to_csv('score_count_by_repo_4o.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "455c8477-e02d-44cd-80dc-854b3e9e0fa5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ground_truth_df = pd.DataFrame(ground_truth)\n",
+ "ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])\n",
+ "ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')\n",
+ "ground_truth_df.to_csv('ground_truth.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28b5150d-d11b-41a5-91bb-32af1a19a776",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "46ae0ecc-c510-418e-ae7b-6db3d6571219",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:fixml]",
+ "language": "python",
+ "name": "conda-env-fixml-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/ipynb/02_plots-for-final-report.ipynb b/analysis/ipynb/02_plots-for-final-report.ipynb
new file mode 100644
index 0000000..36e9feb
--- /dev/null
+++ b/analysis/ipynb/02_plots-for-final-report.ipynb
@@ -0,0 +1,693 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e9a74646-ec18-49c0-b9ef-ed3b5ba64087",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+       "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import altair as alt\n",
+ "import pandas as pd\n",
+ "\n",
+ "df_repo__stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')\n",
+ "gt = pd.read_csv('ground_truth.csv')\n",
+ "gt = gt.melt(id_vars=['id', 'title'], var_name='repo', value_name='ground_truth')\n",
+ "\n",
+ "df_repo__stat_with_gt = df_repo__stat.merge(gt, on=['id', 'title', 'repo'])\n",
+ "\n",
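+    "# error-bar bounds: mean +/- 1 SD, clipped to the valid score range [0, 1]\n",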
+ "base = alt.Chart(\n",
+ " df_repo__stat_with_gt.query('repo in [\"lightfm\", \"qlib\", \"DeepSpeech\"]')\n",
+ ").transform_calculate(\n",
+ " min=\"max(0, datum.mean-datum.std)\",\n",
+ " max=\"min(1, datum.mean+datum.std)\"\n",
+ ")\n",
+ " \n",
+ "# generate the points\n",
+ "points = base.mark_point(\n",
+ " filled=True,\n",
+ " size=50,\n",
+ " color='black'\n",
+ ").encode(\n",
+ " x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title(\"Score\").axis(\n",
+ " labelExpr=\"datum.value % 0.5 ? null : datum.label\"\n",
+ " ),\n",
+ " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),\n",
+ ")\n",
+ "\n",
+ "# generate the points for ground truth\n",
+ "gt_points = base.mark_point(\n",
+ " filled=True,\n",
+ " size=200,\n",
+ " color='green',\n",
+ " shape=\"diamond\"\n",
+ ").encode(\n",
+ " x=alt.X('ground_truth:Q'),\n",
+ " y=alt.Y('id_title:N')\n",
+ ")\n",
+ "\n",
+ "# generate the error bars\n",
+ "errorbars = base.mark_errorbar().encode(\n",
+ " x=alt.X(\"min:Q\").title('1 SD'), #\"id:N\",\n",
+ " x2=\"max:Q\",\n",
+ " y=\"id_title:N\"\n",
+ ")\n",
+ "\n",
+ "(gt_points + points + errorbars).facet(\n",
+ " column=alt.Column('repo:N').title(None)\n",
+ ").configure_axis( \n",
+ " labelFontSize=12, \n",
+ " titleFontSize=12\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa1c3071-33f8-47da-a10d-42b7dbdcce1d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a169da71-4be7-4c88-8553-d6b68c2b1edf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "[HTML rendering of the contingency table stripped; see the text/plain output below]"
+ ],
+ "text/plain": [
+ "score 0.0 \\\n",
+ "Repository Checklist Item Ground Truth \n",
+ "lightfm 3.5. Check for Duplicate Records in Data 0.0 30 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 1 \n",
+ " 2.1. Ensure Data File Loads as Expected 1.0 0 \n",
+ " 3.2. Data in the Expected Format 1.0 0 \n",
+ " 4.2. Verify Data Split Proportion 1.0 0 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 0 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 0 \n",
+ "qlib 3.5. Check for Duplicate Records in Data 0.0 23 \n",
+ " 2.1. Ensure Data File Loads as Expected 0.5 0 \n",
+ " 4.2. Verify Data Split Proportion 0.5 3 \n",
+ " 3.2. Data in the Expected Format 1.0 0 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 1 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 2 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 0 \n",
+ "\n",
+ "score 0.5 \\\n",
+ "Repository Checklist Item Ground Truth \n",
+ "lightfm 3.5. Check for Duplicate Records in Data 0.0 0 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 29 \n",
+ " 2.1. Ensure Data File Loads as Expected 1.0 0 \n",
+ " 3.2. Data in the Expected Format 1.0 30 \n",
+ " 4.2. Verify Data Split Proportion 1.0 11 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 5 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 1 \n",
+ "qlib 3.5. Check for Duplicate Records in Data 0.0 7 \n",
+ " 2.1. Ensure Data File Loads as Expected 0.5 0 \n",
+ " 4.2. Verify Data Split Proportion 0.5 25 \n",
+ " 3.2. Data in the Expected Format 1.0 14 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 25 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 18 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 24 \n",
+ "\n",
+ "score 1.0 \n",
+ "Repository Checklist Item Ground Truth \n",
+ "lightfm 3.5. Check for Duplicate Records in Data 0.0 0 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 0.5 0 \n",
+ " 2.1. Ensure Data File Loads as Expected 1.0 30 \n",
+ " 3.2. Data in the Expected Format 1.0 0 \n",
+ " 4.2. Verify Data Split Proportion 1.0 19 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 25 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 29 \n",
+ "qlib 3.5. Check for Duplicate Records in Data 0.0 0 \n",
+ " 2.1. Ensure Data File Loads as Expected 0.5 30 \n",
+ " 4.2. Verify Data Split Proportion 0.5 2 \n",
+ " 3.2. Data in the Expected Format 1.0 16 \n",
+ " 5.3. Ensure Model Output Shape Aligns with Expe... 1.0 4 \n",
+ " 6.1. Verify Evaluation Metrics Implementation 1.0 10 \n",
+ " 6.2. Evaluate Model's Performance Against Thres... 1.0 6 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_repo_run = pd.read_csv('score_by_repo_run_3.5-turbo.csv')\n",
+ "\n",
+ "df_repo_run = df_repo_run.merge(gt, on=['id', 'title', 'repo'])\n",
+ "\n",
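+    "# count, for each repository and checklist item, how many runs produced each score value\n",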
+ "contingency_table = pd.pivot_table(\n",
+ " df_repo_run,\n",
+ " values='run', \n",
+ " index=['repo', 'id_title', 'ground_truth'], \n",
+ " columns=['score'],\n",
+ " aggfunc='count', \n",
+ " fill_value=0\n",
+ ")\n",
+ "contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth']\n",
+ "contingency_table.sort_index(level=[0, 2])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "afc06ca7-5f39-4293-8bdb-9d46558e7535",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f23950d5-792f-4f0a-8e3a-1727b3598dd8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
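+    "# per-repo standard deviation of each checklist item's score across runs\n",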
+ "stds = df_repo__stat[['repo', 'std', 'id_title']].pivot(index='repo', columns='id_title').copy()\n",
+ "stds.columns = [col[1] for col in stds.columns]\n",
+ "stds = stds.reset_index()\n",
+ "stds = stds.melt(id_vars='repo', var_name='id_title')\n",
+ "\n",
+ "base = alt.Chart(stds)\n",
+ "\n",
+ "box = base.mark_boxplot(\n",
+ " color='grey',\n",
+ " opacity=0.5,\n",
+ " size=20,\n",
+ ").encode(\n",
+ " x=alt.X('value:Q').title('Standard Deviation of Scores'),\n",
+ " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))\n",
+ ")\n",
+ "\n",
+ "stripplot = base.mark_circle(size=100).encode(\n",
+ " y=alt.Y( \n",
+ " 'id_title:N',\n",
+ " axis=alt.Axis(ticks=False, grid=True, labels=True), \n",
+ " scale=alt.Scale(), \n",
+ " ), \n",
+ " x='value:Q',\n",
+ " yOffset=\"jitter:Q\",\n",
+ " color=alt.Color('id_title:N', legend=None),\n",
+ " tooltip='repo'\n",
+ ").transform_calculate(\n",
+ " # Generate Gaussian jitter with a Box-Muller transform\n",
+ " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
+ ")\n",
+ "\n",
+ "(\n",
+ " box + stripplot\n",
+ ").configure_view( \n",
+ " stroke=None\n",
+ ").configure_axis( \n",
+ " labelFontSize=12, \n",
+ " titleFontSize=12\n",
+ ").properties(\n",
+ " height=300, \n",
+ " width=600,\n",
+ " title=\"30 Runs on Openja's Repositories for each Checklist Item\"\n",
+ ") "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1d493dc0-5b75-4348-a627-b1194e498b0d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "bade3842-185e-4369-a5d7-4356290df058",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_repo_4o__stat = pd.read_csv('score_stat_by_repo_4o.csv')\n",
+ "df_repo_4o__stat_with_gt = df_repo_4o__stat.merge(gt, on=['id', 'title', 'repo'])\n",
+ "df_repo_4o__stat_with_gt['model'] = 'gpt-4o'\n",
+ "\n",
+ "df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query(\"repo == 'lightfm'\").copy()\n",
+ "df_repo_35turbo__stat_with_gt['model'] = 'gpt-3.5-turbo'\n",
+ "\n",
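+    "# combine per-model score statistics for comparison (gpt-3.5-turbo restricted to lightfm)\n",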
+ "df_model_comp = pd.concat(\n",
+ " (df_repo_35turbo__stat_with_gt, df_repo_4o__stat_with_gt), \n",
+ " axis=0\n",
+ ")\n",
+ "\n",
+ "base = alt.Chart(\n",
+ " df_model_comp\n",
+ ").transform_calculate(\n",
+ " min=\"max(0, datum.mean-datum.std)\",\n",
+ " max=\"min(1, datum.mean+datum.std)\"\n",
+ ")\n",
+ " \n",
+ "# generate the points\n",
+ "points = base.mark_point(\n",
+ " filled=True,\n",
+ " size=50,\n",
+ " color='black'\n",
+ ").encode(\n",
+ " x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title(\"Score\").axis(\n",
+ " labelExpr=\"datum.value % 0.5 ? null : datum.label\"\n",
+ " ),\n",
+ " y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),\n",
+ ")\n",
+ "\n",
+ "# generate the points for ground truth\n",
+ "gt_points = base.mark_point(\n",
+ " filled=True,\n",
+ " size=200,\n",
+ " color='green',\n",
+ " shape=\"diamond\"\n",
+ ").encode(\n",
+ " x=alt.X('ground_truth:Q'),\n",
+ " y=alt.Y('id_title:N')\n",
+ ")\n",
+ "\n",
+ "# generate the error bars\n",
+ "errorbars = base.mark_errorbar().encode(\n",
+ " x=alt.X(\"min:Q\").title('1 SD'), #\"id:N\",\n",
+ " x2=\"max:Q\",\n",
+ " y=\"id_title:N\"\n",
+ ")\n",
+ "\n",
+ "(gt_points + points + errorbars).facet(\n",
+ " column=alt.Column('model:N').title(None)\n",
+ ").configure_axis( \n",
+ " labelFontSize=12, \n",
+ " titleFontSize=12\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1af0fef8-2c34-4166-affe-93224c639cf9",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:fixml]",
+ "language": "python",
+ "name": "conda-env-fixml-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/preprocess_batch_run_result.py b/analysis/preprocess_batch_run_result.py
new file mode 100644
index 0000000..d792633
--- /dev/null
+++ b/analysis/preprocess_batch_run_result.py
@@ -0,0 +1,163 @@
+import os
+import json
+import yaml
+import pandas as pd
+from collections import Counter
+
+id_item_map = {
+ '2.1': 'Ensure Data File Loads as Expected',
+ '3.2': 'Data in the Expected Format',
+ '3.5': 'Check for Duplicate Records in Data',
+ '4.2': 'Verify Data Split Proportion',
+ '5.3': 'Ensure Model Output Shape Aligns with Expectation',
+ '6.1': 'Verify Evaluation Metrics Implementation',
+ '6.2': "Evaluate Model's Performance Against Thresholds"
+}
+
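+# Reference ("ground truth") checklist scores for three Openja repositories,
+# used for comparison against the generated scores.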
+ground_truth = [
+ {'repo': 'lightfm', 'id': '2.1', 'score': 1},
+ {'repo': 'lightfm', 'id': '3.2', 'score': 1},
+ {'repo': 'lightfm', 'id': '3.5', 'score': 0},
+ {'repo': 'lightfm', 'id': '4.2', 'score': 1},
+ {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
+ {'repo': 'lightfm', 'id': '6.1', 'score': 1},
+ {'repo': 'lightfm', 'id': '6.2', 'score': 1},
+ {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
+ {'repo': 'qlib', 'id': '3.2', 'score': 1},
+ {'repo': 'qlib', 'id': '3.5', 'score': 0},
+ {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
+ {'repo': 'qlib', 'id': '5.3', 'score': 1},
+ {'repo': 'qlib', 'id': '6.1', 'score': 1},
+ {'repo': 'qlib', 'id': '6.2', 'score': 1},
+ {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
+ {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
+]
+
+def get_score_report_from_response(resp_path, verbose=False):
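+    '''Parse a single batch-run response JSON into a per-test-file score table.
+
+    Returns a DataFrame with one row per evaluated test file, one column per
+    checklist item ID, plus `success` and `response_path` columns.
+    '''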
+ if verbose:
+ print(resp_path)
+ with open(resp_path, 'rb') as file:
+ response = json.load(file)
+
+ reports = [] # report for each test file
+    for result in response['call_results']: # each call result covers one test file
+ if result['parsed_response']:
+ resp = result['parsed_response']['results']
+ for item in resp:
+ item['file'] = result['files_evaluated'][0]
+ item['success'] = result['success']
+ reports.append(item)
+        # FIXME: failed runs are not handled yet
+ # else: # if the run is failed, the parsed_response will be None
+ # reports.append({
+ # 'ID': '2.1',
+ # 'Title': '',
+ # 'Requirement': '',
+ # 'Observation': '',
+ # 'Functions': [],
+ # 'Evaluation': '',
+ # 'Score': 0,
+ # 'file': result['files_evaluated'][0],
+ # 'success': result['success']
+ # })
+
+ reports_df = pd.DataFrame(reports)
+ df = (
+ reports_df
+ .pivot(index='file', columns='ID', values='Score')
+ .rename_axis(None, axis=1)
+ )
+ df['success'] = reports_df.groupby(['file'])['success'].all()
+ df['response_path'] = os.path.abspath(resp_path)
+
+ return df.reset_index()
+
+def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):
+ ''' Get score for each checklist item, by repository, by run and by test file
+ '''
+ with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:
+ config = pd.DataFrame(yaml.safe_load(file))
+
+ config['response_path'] = config['response_path'].apply(
+ lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))
+ )
+
+ tmp = [
+ get_score_report_from_response(
+ os.path.join(batch_run_dir_path, path),
+ verbose=verbose
+ ) for path in config['response_path']
+ ]
+ tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
+
+ return config.merge(tmp, on='response_path', how='left')
+
+def preprocess(df_repo_run_file, id_item_map=None):
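+    '''Aggregate per-file scores to per-run scores (max across test files), then
+    derive per-repo statistics (mean/std/count) and per-score counts.
+
+    Returns (df_repo_run, df_repo__stat, df_repo__count).
+    '''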
+ if id_item_map is None:
+ id_item_map = {
+ '2.1': 'Ensure Data File Loads as Expected',
+ '3.2': 'Data in the Expected Format',
+ '3.5': 'Check for Duplicate Records in Data',
+ '4.2': 'Verify Data Split Proportion',
+ '5.3': 'Ensure Model Output Shape Aligns with Expectation',
+ '6.1': 'Verify Evaluation Metrics Implementation',
+ '6.2': "Evaluate Model's Performance Against Thresholds"
+ }
+
+ # prepare score data by repo, by run
+ df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
+ id: ['max'] for id in id_item_map.keys()
+ })
+ df_repo_run.columns = [col[0] for col in df_repo_run.columns]
+ df_repo_run = df_repo_run.reset_index()
+
+ # prepare statistics of scores by repo
+ df_repo__stat = df_repo_run.groupby(['repo']).agg({
+ id: ['mean', 'std', 'count'] for id in id_item_map.keys()
+ })
+ df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
+ df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
+ df_repo__stat = (
+ df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
+ .reset_index()
+ .rename_axis(None, axis=1)
+ )
+ df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])
+ df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']
+
+ # prepare counting of scores by repo
+ df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
+ for id in list(id_item_map.keys())[1:]:
+ df_repo__count = df_repo__count.merge(
+ df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
+ on=['repo', 'level_1'],
+ how='outer'
+ )
+ #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])
+
+ df_repo__count = df_repo__count.fillna(0)
+
+ df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')
+ df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])
+ df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']
+
+ return (df_repo_run, df_repo__stat, df_repo__count)
+
+
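+# Build per-run scores, per-repo statistics, and score counts for each model's batch-run output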
+for model in ['3.5-turbo', '4-turbo', '4o']:
+ df_repo_run_file = get_scores_by_repo_by_run_by_file(f'data/batch_run/batch_run_{model}/')
+ df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)
+
+ df_repo_run.to_csv(f'data/processed/score_by_repo_run_{model}.csv', index=False)
+ df_repo__stat.to_csv(f'data/processed/score_stat_by_repo_{model}.csv', index=False)
+ df_repo__count.to_csv(f'data/processed/score_count_by_repo_{model}.csv', index=False)
+
+ground_truth_df = pd.DataFrame(ground_truth)
+ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])
+ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')
+ground_truth_df.to_csv('data/processed/ground_truth.csv')
\ No newline at end of file
diff --git a/analysis/results/figures/.gitkeep b/analysis/results/figures/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/analysis/results/output/.gitkeep b/analysis/results/output/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/analysis/results/tables/.gitkeep b/analysis/results/tables/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/environment.yml b/environment.yml
index a39d0d9..33ce2ab 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,6 +4,7 @@ channels:
- defaults
- anaconda
dependencies:
+ - altair=5.3
- tectonic=0.15.0
- pandoc=3.2
- poetry=1.8.3