diff --git a/docs/02_finding-report.html b/docs/02_finding-report.html
deleted file mode 100644
index 5ceefa2..0000000
--- a/docs/02_finding-report.html
+++ /dev/null
@@ -1,2241 +0,0 @@

NOTE: the result is based on the code base abb9a21, which is similar to the commit 69d61a9 in the main branch

import scipy
-import pickle
-import yaml
-import pandas as pd
-import altair as alt
-from collections import Counter
-
-
-
def get_report(response):
-    report = []
-    for result in response.call_results:
-        if result.parsed_response:
-            resp = result.parsed_response['results']
-            for item in resp:
-                item['file'] = result.files_evaluated[0] 
-                item['success'] = result.success
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result.files_evaluated[0],
-                'success': result.success
-            })
-    return pd.DataFrame(report)
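# (annotation) A minimal sketch of the response structure that get_report() assumes,
# with field names inferred from the accesses above; the values are illustrative only:
#
#   response.call_results[0].files_evaluated == ['path/to/test_file.py']
#   response.call_results[0].success == True
#   response.call_results[0].parsed_response == {
#       'results': [
#           {'ID': '2.1', 'Title': '...', 'Requirement': '...', 'Observation': '...',
#            'Functions': [], 'Evaluation': '...', 'Score': 0.5},
#       ]
#   }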
-
-def extract_file_and_scores(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        response = pickle.load(file)
-    report = get_report(response)
-    df = (
-        report
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = report.groupby(['file'])['success'].all()
-    df['response_path'] = resp_path
-    return df.reset_index()
-
-def generate_stat_plot(df_repo__stat, ground_truth=None, facet_col='repo', repo=None, id=None):
-    """
-    Generate Stat plot across all repo and all checklist item
-    Optional to incorporate ground truth and select specific repo/checklist item
-    """
-    if facet_col == 'repo':
-        x_col = 'id'
-        x_title = 'Checklist ID'
-    elif facet_col == 'id':
-        x_col = 'repo'
-        x_title = 'Repository'
-    
-    # the base chart
-    if repo:
-        df_repo__stat = df_repo__stat.query(f'repo == "{repo}"')
-    if id:
-        df_repo__stat = df_repo__stat.query(f'id == "{id}"')
-    
-    base = alt.Chart().transform_calculate(
-        min="max(0, datum.mean-datum.std)",
-        max="min(1, datum.mean+datum.std)"
-    )
-    
-    # generate the points
-    points = base.mark_point(
-        filled=True,
-        size=50,
-        color='black'
-    ).encode(
-        x=alt.X(f'{x_col}:O').axis(labelAngle=0).title(x_title),
-        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
-    )
-    
-    # generate the error bars
-    errorbars = base.mark_errorbar().encode(
-        x=f"{x_col}:O",
-        y=alt.Y("min:Q").title('1 SD'),
-        y2="max:Q"
-    )
-
-    plot = points + errorbars
-    
-    if ground_truth is not None:
-        # generate points of ground truth
-        if repo:
-            ground_truth = ground_truth.query(f'repo == "{repo}"')
-        if id:
-            ground_truth = ground_truth.query(f'id == "{id}"')
-        
-        df_repo__stat = pd.merge(df_repo__stat, ground_truth, how='left', on=['repo', 'id'])
-        
-        gt_points = alt.Chart().mark_point(
-            filled=True,
-            size=100,
-            color='green',
-            shape="diamond"
-        ).encode(
-            x=alt.X(f'{x_col}:O'),
-            y=alt.Y('score:Q')
-        )
-
-        plot += gt_points
-
-    plot = alt.layer(
-                plot,
-                data=df_repo__stat
-            ).properties(
-                width=400,
-            ).facet(
-                column=f'{facet_col}',
-                columns=2
-            )
-
-    return plot
-
-
-

preprocess data

-
-
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']
-
-#result_path = '../draft/batch_run_results/record_combine.yml'
-result_path = '../data/processed/batch_run/record_combine.yml'
-with open(result_path, 'r') as file:
-    config = pd.DataFrame(yaml.safe_load(file))
-
-# prepare score data by repo, run, file
-tmp = [
-    extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
-]
-tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-
-raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')
-
-
-
# filter non-test files in qlib
-df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')
-
-# prepare score data by repo, run
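# (annotation) The aggregation below takes the max Score across a repo's test files,
# i.e. a checklist item counts as satisfied for a run if any one evaluated file satisfies it.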
-df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-    id: ['max'] for id in checklist_ids
-})
-df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-df_repo_run = df_repo_run.reset_index()
-
-# prepare statistics of scores by repo
-df_repo__stat = df_repo_run.groupby(['repo']).agg({
-    id: ['mean', 'std', 'count'] for id in checklist_ids
-})
-df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-df_repo__stat = (
-    df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-    .reset_index()
-    .rename_axis(None, axis=1)
-)
-
-# prepare counting of scores by repo
-df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-for id in checklist_ids[1:]:
-    df_repo__count = df_repo__count.merge(
-        df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-        on=['repo', 'level_1'],
-        how='outer'
-    )
-
-df_repo__count = df_repo__count.fillna(0)
-
-
-
-

Runs Quality

-
-

1. Some non-test files are included in the evaluation

-

For example, ./nanodet/nanodet/trainer/task.py is included even though it is not a test file.

-
-
raw_df_repo_run_file.query('repo == "nanodet"')['file'].unique()[:3]
-
-
array(['../data/raw/openja/nanodet/nanodet/trainer/task.py',
-       '../data/raw/openja/nanodet/tests/test_configs/test_config.py',
-       '../data/raw/openja/nanodet/tests/test_data/test_batch_process.py'],
-      dtype=object)
-
-
-
-
-

2. Evaluation of the file magenta/magenta/models/music_vae/data_test.py always fails

-
-
df_repo_run_file[~df_repo_run_file.success]['file'].unique()
-
-
array(['../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
-       '../data/raw/openja/paperless-ng/src/documents/tests/test_api.py'],
-      dtype=object)
-
-
-
-
-

3. DeepSpeech, lightfm and magenta have the fewest (Python) test files

-
-
df_repo_run_file.query('run == 1').groupby(['repo'])['file'].count().reset_index()
-
-
           repo  file
0    DeepSpeech     3
1        apollo    14
2       lightfm     7
3       magenta     8
4           mmf    70
5  mycroft-core    64
6       nanodet    42
7  paperless-ng    35
8          qlib    31
-
-
-
-
-

4. The test files are not always in a tests/ folder. Would it be good practice to always do that? Should one of the checklist items ensure that all tests are placed under a tests/ folder?

-

For example, magenta

-
-
df_repo_run_file.query('repo == "magenta"')['file'].unique()
-
-
array(['../data/raw/openja/magenta/conftest.py',
-       '../data/raw/openja/magenta/magenta/common/state_util_test.py',
-       '../data/raw/openja/magenta/magenta/models/coconet/export_saved_model_test.py',
-       '../data/raw/openja/magenta/magenta/models/coconet/lib_data.py',
-       '../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
-       '../data/raw/openja/magenta/magenta/models/onsets_frames_transcription/create_dataset_lib_test.py',
-       '../data/raw/openja/magenta/magenta/models/score2perf/datagen_beam_test.py',
-       '../data/raw/openja/magenta/magenta/pipelines/pipeline_test.py'],
-      dtype=object)
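As a rough illustration of how this could be quantified, below is a minimal sketch (assuming the df_repo_run_file frame built above, with its repo and file columns) that computes the share of evaluated files living under a tests/ directory per repository:

import pandas as pd

def share_under_tests(df: pd.DataFrame) -> pd.Series:
    # Flag files whose path contains a "/tests/" component, then average per repo.
    in_tests = df['file'].str.contains('/tests/', regex=False)
    return in_tests.groupby(df['repo']).mean().sort_values()

# e.g. share_under_tests(df_repo_run_file.query('run == 1'))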
-
-
-
-
-
-

Findings on 9 repos

-
-
df_repo_run_file.repo.unique()
-
-
array(['lightfm', 'qlib', 'mmf', 'nanodet', 'magenta', 'DeepSpeech',
-       'paperless-ng', 'mycroft-core', 'apollo'], dtype=object)
-
-
-
-

1. Overview of accuracy and consistency of the lightfm evaluation

-

Let the ground truth for lightfm be as follows:

-
-
# Ground truth
-ground_truth = pd.DataFrame([
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-])
-ground_truth[ground_truth.repo == 'lightfm']
-
-
      repo   id  score
0  lightfm  2.1    1.0
1  lightfm  3.2    1.0
2  lightfm  3.5    0.0
3  lightfm  4.2    1.0
4  lightfm  5.3    0.5
5  lightfm  6.1    1.0
6  lightfm  6.2    1.0
-
-
-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="lightfm", facet_col='repo')
-

The distribution of scores for each checklist item:

-
-
df_repo__count.query('repo == "lightfm"')
-
-
      repo  level_1   2.1   3.2   3.5   4.2   5.3   6.1   6.2
6  lightfm      0.0   0.0   1.0  19.0   0.0  18.0   0.0   0.0
7  lightfm      0.5   1.0  29.0   6.0  27.0  12.0  20.0   4.0
8  lightfm      1.0  29.0   0.0   5.0   3.0   0.0  10.0  26.0
-
-
-

Observations: The system evaluation broadly aligns with our manual evaluation:
- for items we judged "Satisfied" (score = 1), the system mostly outputs 0.5 or 1;
- for items we judged "Partially Satisfied" or "Not Satisfied", the system mostly outputs 0.5 or 0;
- some checklist items display high variance, e.g. 3.5, 5.3 and 6.1.

-
-
-

2. Overview of qlib

-

Let the ground truth for qlib be as follows (FIXME: to be confirmed):

-
-
# Ground truth
-ground_truth[ground_truth.repo == 'qlib']
-
-
    repo   id  score
7   qlib  2.1    0.5
8   qlib  3.2    1.0
9   qlib  3.5    0.0
10  qlib  4.2    0.5
11  qlib  5.3    1.0
12  qlib  6.1    1.0
13  qlib  6.2    1.0
-
-
-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="qlib", facet_col='repo')
-
-
df_repo__count.query('repo == "qlib"')
-
-
    repo  level_1   2.1   3.2   3.5   4.2   5.3   6.1   6.2
24  qlib      0.0   0.0   1.0  29.0   3.0  14.0   4.0   1.0
25  qlib      0.5   0.0  12.0   1.0  27.0  16.0  24.0  26.0
26  qlib      1.0  30.0  17.0   0.0   0.0   0.0   2.0   3.0
-
-
-

Observations:
- There is more disagreement between the system and the manual evaluation, especially for 5.3, 6.1 and 6.2.
- The per-item consistency in this repo is not similar to that in lightfm: e.g. the variance for 3.5 is greatly reduced, while the variance for 3.2 becomes larger.
- However, qlib is not just a machine learning project; it also contains a software system. For example, it generates a lot of data randomly by itself instead of reading a data file for analysis, which seems to deviate from the objective of 2.1.

-
-
-

3. The consistency of each checklist item

-
  • Why is it important? If the score of a particular item varies a lot when evaluating a repository, it might mean that its prompt (Requirement) is confusing to the LLM, or that the checklist item itself is not well defined.
-
df_repo__stat.pivot(index='id', columns='repo', values='std')
-
-
repo  DeepSpeech    apollo   lightfm   magenta       mmf  mycroft-core   nanodet  paperless-ng      qlib
id
2.1     0.479463  0.152564  0.091287  0.423451  0.000000      0.000000  0.000000      0.242117  0.000000
3.2     0.406838  0.215092  0.091287  0.189525  0.245066      0.278027  0.239732      0.091287  0.285673
3.5     0.000000  0.000000  0.388040  0.252003  0.126854      0.000000  0.252003      0.000000  0.091287
4.2     0.000000  0.000000  0.152564  0.091287  0.126854      0.000000  0.254274      0.000000  0.152564
5.3     0.000000  0.000000  0.249136  0.000000  0.126854      0.000000  0.000000      0.000000  0.253708
6.1     0.351107  0.172873  0.239732  0.252003  0.233046      0.000000  0.285673      0.000000  0.224888
6.2     0.000000  0.000000  0.172873  0.000000  0.201289      0.253708  0.260415      0.126854  0.182574
-
-
-
-
alt.Chart(df_repo__stat).mark_boxplot().encode(
-    x="std:Q",
-    y='id:N'
-).properties(
-    height=200,
-    width=400
-)
-

Observations:
- The evaluation of checklist item 2.1 Ensure Data File Loads as Expected is usually stable: when evaluating a repository, its standard deviation is smaller than 0.05 half of the time, the smallest among all items.

-

Below is the breakdown of item scores for each repository
(NOTE: only lightfm and qlib have ground truth, shown as green diamonds):

-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, facet_col='id')
-

Observations:
- (TBC) The standard deviations for items 3.5 and 5.3 vary greatly across repositories, which might imply that the test cases in some repos are confusing to the LLM while those in others are clear.
- (TBC) The standard deviations for items 5.3, 6.1 and 6.2 are relatively high and consistent, which might imply there is room to refine the prompts to reduce consistency issues.

-
-
-

4. The consistency of each checklist item, compared to lightfm

-
  • Why is it important? We optimized the consistency of our system using lightfm, so we treat this repository as a benchmark. If a particular checklist item has much worse consistency in another repository, that might mean the prompt for that item is not generalizable.

Below are the standard deviations over 30 runs for each checklist item and repository:

-
-
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
-stds
-
-
               std
id             2.1       3.2       3.5       4.2       5.3       6.1       6.2
repo
DeepSpeech     0.479463  0.406838  0.000000  0.000000  0.000000  0.351107  0.000000
apollo         0.152564  0.215092  0.000000  0.000000  0.000000  0.172873  0.000000
lightfm        0.091287  0.091287  0.388040  0.152564  0.249136  0.239732  0.172873
magenta        0.423451  0.189525  0.252003  0.091287  0.000000  0.252003  0.000000
mmf            0.000000  0.245066  0.126854  0.126854  0.126854  0.233046  0.201289
mycroft-core   0.000000  0.278027  0.000000  0.000000  0.000000  0.000000  0.253708
nanodet        0.000000  0.239732  0.252003  0.254274  0.000000  0.285673  0.260415
paperless-ng   0.242117  0.091287  0.000000  0.000000  0.000000  0.000000  0.126854
qlib           0.000000  0.285673  0.091287  0.152564  0.253708  0.224888  0.182574
-
-
-
-
stds_p = stds.copy()
-stds_p.columns = [col[1] for col in stds_p.columns]
-stds_p = stds_p.reset_index()
-stds_p = stds_p.melt(id_vars='repo', var_name='id')
-
-
-
stds_p.head()
-
-
         repo   id     value
0  DeepSpeech  2.1  0.479463
1      apollo  2.1  0.152564
2     lightfm  2.1  0.091287
3     magenta  2.1  0.423451
4         mmf  2.1  0.000000
-
-
-
-
# stripplot = (
-#     alt.Chart(stds_p)
-#     .mark_point(filled=True, size=100)
-#     .transform_calculate( 
-#         # Generate Gaussian jitter with a Box-Muller transform 
-#         jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
-#         # jitter='random()'
-#     ).encode( 
-#         y=alt.Y( 
-#             'jitter:Q', 
-#             title=None, 
-#             axis=alt.Axis(ticks=False, grid=True, labels=False), 
-#             scale=alt.Scale(), 
-#         ), 
-#         x=alt.X('value:Q'), 
-#         color=alt.Color('repo:N'),
-#         row=alt.Row( 
-#             'id:N',
-#             header=alt.Header(
-#                 labelFontSize=16,
-#                 labelAngle=0
-#             )
-#         ),
-#         tooltip='repo'
-#     ).configure_facet( 
-#         spacing=0
-#     ).configure_view( 
-#         stroke=None
-#     ).configure_axis( 
-#         labelFontSize=16, 
-#         titleFontSize=16
-#     ).properties(
-#         height=50, 
-#         width=600
-#     ) 
-# )
-    
-# stripplot 
-
-
-
def generate_jitterbox_plot(df_stds_p):
-    """
-    Generate jitterbox plot across all repo and all checklist item
-    """
-    box = alt.Chart().mark_boxplot(
-        color='grey',
-        opacity=0.5,
-        size=20,
-    ).encode(
-        x=alt.X('value:Q').title('SD(Score)'),
-        y=alt.Y('id:N', title=None, axis=alt.Axis(labelPadding=10, grid=False))
-    )
-    
-    stripplot = alt.Chart().mark_circle(size=100).encode(
-        y=alt.Y( 
-            'id:N',
-            axis=alt.Axis(ticks=False, grid=True, labels=True), 
-            scale=alt.Scale(), 
-        ), 
-        x='value:Q',
-        yOffset="jitter:Q",
-        color=alt.Color('id:N', legend=None),
-        tooltip='repo'
-    ).transform_calculate(
-        # Generate Gaussian jitter with a Box-Muller transform
-        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-    )
-    
-    plot = alt.layer(
-        box,
-        stripplot,
-        data=df_stds_p
-    ).configure_view( 
-        stroke=None
-    ).configure_axis( 
-        labelFontSize=16, 
-        titleFontSize=16
-    ).properties(
-        height=300, 
-        width=600
-    ) 
-    
-    return plot
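For reference, the jitter expression in the code above is the Box-Muller transform, which turns two Uniform(0, 1) draws into one (approximately) standard normal draw; a NumPy equivalent of the Vega expression, for illustration only:

import numpy as np

def box_muller_jitter(n, rng=np.random.default_rng(0)):
    # Same formula as the Vega expression sqrt(-2*log(random()))*cos(2*PI*random()):
    # two independent uniforms in, one (approximately) standard normal draw out.
    u1, u2 = rng.random(n), rng.random(n)
    return np.sqrt(-2 * np.log(u1)) * np.cos(2 * np.pi * u2)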
-
-
-
generate_jitterbox_plot(stds_p)
-
-
alt.Chart(df_repo__stat).mark_boxplot().encode(
-    x="std:Q",
-    y='id:N'
-).properties(
-    height=200,
-    width=400
-)
-
-
# !pip install altair_catplot
-# !pip install seaborn
-
-
-
# import altair_catplot
-
-# altair_catplot.catplot(
-#     stds_p, 
-#     transform ='jitterbox', 
-#     mark ='point', 
-#     encoding = dict(
-#         x = alt.X('value:Q'), 
-#         y = alt.Y('id:N'), 
-#         color = alt.Color('repo:N')
-#     ) 
-# )
-
-
-
F = stds.drop(index='lightfm') / stds.loc['lightfm']
-
-base = alt.Chart(
-    F.melt(ignore_index=False).reset_index()[['repo', 'id', 'value']]
-).transform_calculate(
-    benchmark="1",
-    threshold=f"{scipy.stats.f.ppf(0.975, 29, 29)}"
-)
-
-point = base.mark_point(
-    filled=True,
-    size=100,
-).encode(
-    x=alt.X('value:Q').title("std ratio (c.f. lightfm)"),
-    y='id:N',
-    color='repo',
-    tooltip='repo'
-).properties(
-    height=200,
-    width=400
-)
-
-point \
-+ base.mark_rule(color='black').encode(x="benchmark:Q") \
-+ base.mark_rule(color='red').encode(x="threshold:Q")
-# jitter instead of mark_point <-- prompt vs. repo problem?
-# prompt: sd of checklist item for all repo is high
-# repo: most of repo have low sd, the repo we're looking at has outlier
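A note on the red threshold line in the chart above: with 30 runs per repository, each sample variance has 29 degrees of freedom, so under the null hypothesis that an item is as consistent in another repository as in lightfm, the ratio of the two sample variances follows an F(29, 29) distribution, and scipy.stats.f.ppf(0.975, 29, 29) is its upper 97.5% quantile (about 2.1). Since the quantity plotted here is a ratio of standard deviations rather than variances, the strictly comparable cutoff on that scale would be the square root of that quantile (about 1.45):

import scipy.stats

var_ratio_cutoff = scipy.stats.f.ppf(0.975, dfn=29, dfd=29)  # cutoff for a ratio of variances
std_ratio_cutoff = var_ratio_cutoff ** 0.5                   # the same cutoff on the std-ratio scale
print(round(var_ratio_cutoff, 3), round(std_ratio_cutoff, 3))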
-

Observations:
- The evaluation of checklist item 3.2 Data in the Expected Format becomes much more unstable in most of the other repositories.
- That of 2.1 is significantly unstable in the repos paperless-ng, magenta and DeepSpeech, but this may be due to the repos themselves.

-

TODO: look into the scores for item 3.2.

-
-
-

TODO: Given ground truth == 1, distribution of system score?

-
-
-

TODO: Given ground truth == 0, distribution of system score?

-
-
def generate_histogram_plot(df_repo_run_long, df_ground_truth=None, repo=None, id=None):
-    """
-    Generate histogram across all repo and all checklist item
-    Optional to incorporate ground truth and select specific repo/checklist item
-    """
-    # data
-    repo_data = df_repo_run_long.copy()
-    if repo:
-        repo_data = repo_data.query(f'repo == "{repo}"')
-    if id:
-        repo_data = repo_data.query(f'id == "{id}"')
-
-    # base histogram chart
-    base = alt.Chart().mark_bar().encode(
-                x=alt.X('eval_score:Q', title='Score'), 
-                y=alt.Y('count()'), 
-                color=alt.value('grey'),
-                size=alt.value(20),
-            )
-    
-    if df_ground_truth is not None:
-        # data
-        gt_data = df_ground_truth.copy()
-        if repo:
-            gt_data = gt_data.query(f'repo == "{repo}"')
-        if id:
-            gt_data = gt_data.query(f'id == "{id}"')
-        
-        repo_data = pd.merge(repo_data, gt_data, how='left', on=['repo', 'id'])
-        repo_data['is_equal_to_gt'] = repo_data['eval_score'] == repo_data['score']
-        
-        # base histogram chart
-        base = base.encode(
-                    color=alt.Color('is_equal_to_gt', scale=alt.Scale(range=['grey', 'green']), legend=None)
-                )
-        base += base.mark_text().encode(
-            text=alt.value('Ground Truth'),
-            x='score',
-            size=alt.value(10),
-            color=alt.value('green'),
-        )
-
-    plot = alt.layer(
-                base,
-                data=repo_data
-            ).properties(
-                width=200,
-                height=200,
-            ).facet(
-                row='repo',
-                column='id'
-            )        
-    
-    return plot
-
-
-
-

Contingency Table

-
-
df_repo_run_p = pd.melt(df_repo_run, id_vars=['repo', 'run'], var_name='id', value_name='eval_score')
-df_repo_run_p = pd.merge(df_repo_run_p, ground_truth, how='inner', on=['repo', 'id'])
-df_repo_run_p = df_repo_run_p.rename(columns={'score': 'ground_truth'})
-pd.pivot_table(df_repo_run_p, values='run', index=['ground_truth'], columns=['eval_score'], aggfunc='count', fill_value=0)
-
-
eval_score    0.0  0.5  1.0
ground_truth
0.0           227    8   35
0.5            21   39   30
1.0            21  159   90
-
-
-
-
df_repo_run_p
-
-
           repo  run   id  eval_score  ground_truth
0    DeepSpeech    1  2.1         1.0           0.0
1    DeepSpeech    2  2.1         1.0           0.0
2    DeepSpeech    3  2.1         0.0           0.0
3    DeepSpeech    4  2.1         1.0           0.0
4    DeepSpeech    5  2.1         0.0           0.0
..          ...  ...  ...         ...           ...
625        qlib   26  6.2         0.5           1.0
626        qlib   27  6.2         0.5           1.0
627        qlib   28  6.2         0.5           1.0
628        qlib   29  6.2         1.0           1.0
629        qlib   30  6.2         0.5           1.0

630 rows × 5 columns

-
-
-
-
-
# generate_histogram_plot(df_repo_run_p, df_ground_truth=ground_truth)
-
-
\ No newline at end of file
diff --git a/docs/04_plots-for-presentations.html b/docs/04_plots-for-presentations.html
deleted file mode 100644
index 2168f03..0000000
--- a/docs/04_plots-for-presentations.html
+++ /dev/null
@@ -1,1036 +0,0 @@

Accuracy: Contingency table

!pip install scipy altair
-
-
Requirement already satisfied: scipy in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (1.13.1)
-Collecting altair
-  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
-Requirement already satisfied: numpy<2.3,>=1.22.4 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from scipy) (1.26.4)
-Requirement already satisfied: jinja2 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (3.1.4)
-Requirement already satisfied: jsonschema>=3.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (4.22.0)
-Requirement already satisfied: packaging in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (23.2)
-Requirement already satisfied: pandas>=0.25 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (2.2.2)
-Collecting toolz (from altair)
-  Using cached toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
-Requirement already satisfied: attrs>=22.2.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (23.2.0)
-Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (2023.12.1)
-Requirement already satisfied: referencing>=0.28.4 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (0.35.1)
-Requirement already satisfied: rpds-py>=0.7.1 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (0.18.1)
-Requirement already satisfied: python-dateutil>=2.8.2 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2.9.0.post0)
-Requirement already satisfied: pytz>=2020.1 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2024.1)
-Requirement already satisfied: tzdata>=2022.7 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2024.1)
-Requirement already satisfied: MarkupSafe>=2.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jinja2->altair) (2.1.5)
-Requirement already satisfied: six>=1.5 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=0.25->altair) (1.16.0)
-Using cached altair-5.3.0-py3-none-any.whl (857 kB)
-Using cached toolz-0.12.1-py3-none-any.whl (56 kB)
-Installing collected packages: toolz, altair
-Successfully installed altair-5.3.0 toolz-0.12.1
-
-
-
-
import scipy
-import pickle
-import json
-import yaml
-import pandas as pd
-import altair as alt
-from collections import Counter
-
-
-
def get_report(response):
-    report = []
-    for result in response.call_results:
-        if result.parsed_response:
-            resp = result.parsed_response['results']
-            for item in resp:
-                item['file'] = result.files_evaluated[0] 
-                item['success'] = result.success
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result.files_evaluated[0],
-                'success': result.success
-            })
-    return pd.DataFrame(report)
-
-def get_report_json(response):
-    report = []
-    for result in response['call_results']:
-        if result['parsed_response']:
-            resp = result['parsed_response']['results']
-            for item in resp:
-                item['file'] = result['files_evaluated'][0] 
-                item['success'] = result['success']
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result['files_evaluated'][0],
-                'success': result['success']
-            })
-    return pd.DataFrame(report)
-
-def extract_file_and_scores(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        try:
-            response = pickle.load(file)
-            report = get_report(response)
-        except:
-            response = json.load(file)
-            report = get_report_json(response)
-    df = (
-        report
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = report.groupby(['file'])['success'].all()
-    df['response_path'] = resp_path
-    return df.reset_index()
-
-
-
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']
-
-def read_and_preprocess(result_path):
-    with open(result_path, 'r') as file:
-        config = pd.DataFrame(yaml.safe_load(file))
-    
-    # prepare score data by repo, run, file
-    tmp = [
-        extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
-    ]
-    tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-    
-    raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')
-
-    # filter non-test files in qlib
-    df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')
-    
-    # prepare score data by repo, run
-    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-        id: ['max'] for id in checklist_ids
-    })
-    df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-    df_repo_run = df_repo_run.reset_index()
-    
-    # prepare statistics of scores by repo
-    df_repo__stat = df_repo_run.groupby(['repo']).agg({
-        id: ['mean', 'std', 'count'] for id in checklist_ids
-    })
-    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-    df_repo__stat = (
-        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-        .reset_index()
-        .rename_axis(None, axis=1)
-    )
-    
-    # prepare counting of scores by repo
-    df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-    for id in checklist_ids[1:]:
-        df_repo__count = df_repo__count.merge(
-            df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-            on=['repo', 'level_1'],
-            how='outer'
-        )
-    
-    df_repo__count = df_repo__count.fillna(0)
-
-    return (df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count)
-
-
-
# Ground truth
-ground_truth = pd.DataFrame([
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-])
-
-
-
id_item_map = {
-    '2.1': 'Ensure Data File Loads as Expected',
-    '3.2': 'Data in the Expected Format',
-    '3.5': 'Check for Duplicate Records in Data',
-    '4.2': 'Verify Data Split Proportion',
-    '5.3': 'Ensure Model Output Shape Aligns with Expectation',
-    '6.1': 'Verify Evaluation Metrics Implementation',
-    '6.2': "Evaluate Model's Performance Against Thresholds"
-}
-
-
-
#result_path = '../draft/batch_run_results/record_combine.yml'
-df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count = read_and_preprocess(
-    '../data/processed/batch_run/record_combine.yml'
-)
-
-
-
cont_table = pd.melt(
-    df_repo_run.query('(repo == "lightfm")')[['repo', 'run', '3.5', '4.2', '5.3']], 
-    id_vars=['repo', 'run'], var_name='id', value_name='System Output')
-cont_table = pd.merge(cont_table, ground_truth, how='inner', on=['repo', 'id'])
-cont_table = cont_table.rename(columns={'score': 'ground_truth'})
-cont_table['title'] = cont_table['id'].apply(lambda x: id_item_map[x])
-#cont_table = cont_table[['repo', 'title', 'ground_truth', 'System Output', 'run']]
-cont_table = pd.pivot_table(cont_table, values='run', index=['repo', 'id', 'title', 'ground_truth'], columns=['System Output'], aggfunc='count', fill_value=0)
-cont_table.index.names = ['Repository', 'ID', 'Title', 'Ground Truth']
-cont_table.sort_index(level=3)
-
-
System Output                                                                     0.0  0.5  1.0
Repository  ID   Title                                               Ground Truth
lightfm     3.5  Check for Duplicate Records in Data                 0.0            19    6    5
            5.3  Ensure Model Output Shape Aligns with Expectation   0.5            18   12    0
            4.2  Verify Data Split Proportion                        1.0             0   27    3
-
-
-
-

Consistency: jitterbox plot

-
-
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id').copy()
-stds.columns = [col[1] for col in stds.columns]
-stds = stds.reset_index()
-stds = stds.melt(id_vars='repo', var_name='id')
-stds['title'] = stds['id'].apply(lambda x: id_item_map[x])
-
-
-
box = alt.Chart().mark_boxplot(
-    color='grey',
-    opacity=0.5,
-    size=20,
-).encode(
-    x=alt.X('value:Q').title('System Output Uncertainty'),
-    y=alt.Y('title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))
-)
-
-stripplot = alt.Chart().mark_circle(size=100).encode(
-    y=alt.Y( 
-        'title:N',
-        axis=alt.Axis(ticks=False, grid=True, labels=True), 
-        scale=alt.Scale(), 
-    ), 
-    x='value:Q',
-    yOffset="jitter:Q",
-    color=alt.Color('id:N', legend=None),
-    tooltip='repo'
-).transform_calculate(
-    # Generate Gaussian jitter with a Box-Muller transform
-    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-)
-
-plot = alt.layer(
-    box,
-    stripplot,
-    data=stds
-).configure_view( 
-    stroke=None
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=300, 
-    width=600,
-    title="30 Runs on Openja's Repositories for each Checklist Item"
-) 
-
-
-
plot
-

Improvement from gpt-3.5-turbo to gpt-4o

-
-
#result_path = '../draft/batch_run_results/record_combine.yml'
-df_repo_run_file_4o, df_repo_run_4o, df_repo_4o__stat, df_repo_4o__count = read_and_preprocess(
-    '../data/processed/batch_run_4o/record_combine.yml'
-)
-
-
-
df_repo_4o__stat
-
-
      repo   id  count  mean  std
0  lightfm  2.1   30.0   1.0  0.0
1  lightfm  3.2   30.0   1.0  0.0
2  lightfm  3.5   30.0   1.0  0.0
3  lightfm  4.2   30.0   1.0  0.0
4  lightfm  5.3   30.0   1.0  0.0
5  lightfm  6.1   30.0   1.0  0.0
6  lightfm  6.2   30.0   1.0  0.0
-
-
-
-
df1 = df_repo__stat.query('(repo == "lightfm") & (id == "4.2")').copy()
-df1['model'] = ['gpt-3.5-turbo']
-
-df2 = df_repo_4o__stat.query('(repo == "lightfm") & (id == "4.2")').copy()
-df2['model'] = ['gpt-4o']
-
-df_model_comp = pd.concat((df1, df2), axis=0)
-
-
-
base = alt.Chart(df_model_comp).transform_calculate(
-    min="max(0, datum.mean-datum.std)",
-    max="min(1, datum.mean+datum.std)"
-)
-    
-# generate the points
-points = base.mark_point(
-    filled=True,
-    size=50,
-    color='black'
-).encode(
-    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("System Output").axis(
-        labelExpr="datum.value % 0.5 ? null : datum.label"
-    ),
-    y=alt.Y('model:N').title("Model")#.scale(domainMin=0, domainMax=1).title('Score'),
-)
-    
-# generate the error bars
-errorbars = base.mark_errorbar().encode(
-    x=alt.X("min:Q").title('1 SD'), #"id:N",
-    x2="max:Q",
-    y="model:N"
-)
-
-(points + errorbars).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=200,
-    width=400,
-    title={
-        'text': '30 Runs on Checklist Item: "Ensure Data File Loads as Expected"',
-        'subtitle': "Ground Truth = 1"
-    }
-)
-
-
\ No newline at end of file
diff --git a/docs/final_report.html b/docs/final_report.html
index e04f18e..9f4d0ee 100644
--- a/docs/final_report.html
+++ b/docs/final_report.html
@@ -54,7 +54,26 @@
+/* CSS for citations */
+div.csl-bib-body { }
+div.csl-entry {
+  clear: both;
+}
+.hanging-indent div.csl-entry {
+  margin-left: 2em;
+  text-indent: -2em;
+}
+div.csl-left-margin {
+  min-width: 2em;
+  float: left;
+}
+div.csl-right-inline {
+  margin-left: 2em;
+  padding-left: 1em;
+}
+div.csl-indent {
+  margin-left: 2em;
+}
@@ -93,6 +112,9 @@
@@ -107,7 +129,7 @@
import pandas as pd
+gt = pd.read_csv('ground_truth.csv')
+gt
+
+
    id  title                                               DeepSpeech  lightfm  qlib
0  2.1  Ensure Data File Loads as Expected                         0.0      1.0   0.5
1  3.2  Data in the Expected Format                                0.0      1.0   1.0
2  3.5  Check for Duplicate Records in Data                        0.0      0.0   0.0
3  4.2  Verify Data Split Proportion                               0.0      1.0   0.5
4  5.3  Ensure Model Output Shape Aligns with Expectation          0.0      0.5   1.0
5  6.1  Verify Evaluation Metrics Implementation                   0.0      1.0   1.0
6  6.2  Evaluate Model's Performance Against Thresholds            0.0      1.0   1.0
+
-

Caption: Ground truth data on the 3 repositories

+

Ground truth data for the 3 repositories. (1 = fully satisfied, 0.5 = partially satisfied, 0 = not satisfied)

Code -
# FIXME: jitter-mean-sd plot (checklist item vs. score) for each repo
+
# FIXME: jitter-mean-sd plot (checklist item vs. score) for each repo
+import altair as alt
+import pandas as pd
+
+df_repo__stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')
+gt = pd.read_csv('ground_truth.csv')
+gt = gt.melt(id_vars=['id', 'title'], var_name='repo', value_name='ground_truth')
+
+df_repo__stat_with_gt = df_repo__stat.merge(gt, on=['id', 'title', 'repo'])
+
+base = alt.Chart(
+    df_repo__stat_with_gt.query('repo in ["lightfm", "qlib", "DeepSpeech"]')
+).transform_calculate(
+    min="max(0, datum.mean-datum.std)",
+    max="min(1, datum.mean+datum.std)"
+)
+    
+# generate the points
+points = base.mark_point(
+    filled=True,
+    size=50,
+    color='black'
+).encode(
+    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Score").axis(
+        labelExpr="datum.value % 0.5 ? null : datum.label"
+    ),
+    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),
+)
+
+# generate the points for ground truth
+gt_points = base.mark_point(
+    filled=True,
+    size=200,
+    color='green',
+    shape="diamond"
+).encode(
+    x=alt.X('ground_truth:Q'),
+    y=alt.Y('id_title:N')
+)
+
+# generate the error bars
+errorbars = base.mark_errorbar().encode(
+    x=alt.X("min:Q").title('1 SD'), #"id:N",
+    x2="max:Q",
+    y="id_title:N"
+)
+
+(gt_points + points + errorbars).facet(
+    column=alt.Column('repo:N').title(None)
+).configure_axis( 
+    labelFontSize=12, 
+    titleFontSize=12
+)
+
-

Caption: Comparison of the satisfaction determined by our system versus the ground truth for each checklist item and repository

+

Comparison of our system’s satisfaction determination versus the ground truth for each checklist item and repository

-

We found that our tool tends to undermine the actual satisfying cases. For the items that are actually satisfied (score = 1), our tool tends to classify as partially satisfied (score = 0.5), while for those that are partially satisfied (score = 0.5), our tool often classfies as not satisfied (score = 0).

+

Our tool tends to underrate satisfying cases, which often classifies fully satisfied items as partially satisfied and partially satisfied items as not satisfied.

Code -
# FIXME: contingency table
+
df_repo_run = pd.read_csv('score_by_repo_run_3.5-turbo.csv')
+
+df_repo_run = df_repo_run.merge(gt, on=['id', 'title', 'repo'])
+
+contingency_table = pd.pivot_table(
+    df_repo_run,
+    values='run', 
+    index=['repo', 'id_title', 'ground_truth'], 
+    columns=['score'],
+    aggfunc='count', 
+    fill_value=0
+)
+contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth']
+contingency_table.sort_index(level=[0, 2])
+
+
score                                                                              0.0  0.5  1.0
Repository  Checklist Item                                          Ground Truth
lightfm     3.5. Check for Duplicate Records in Data                0.0             30    0    0
            5.3. Ensure Model Output Shape Aligns with Expectation  0.5              1   29    0
            2.1. Ensure Data File Loads as Expected                 1.0              0    0   30
            3.2. Data in the Expected Format                        1.0              0   30    0
            4.2. Verify Data Split Proportion                       1.0              0   11   19
            6.1. Verify Evaluation Metrics Implementation           1.0              0    5   25
            6.2. Evaluate Model's Performance Against Thresholds    1.0              0    1   29
qlib        3.5. Check for Duplicate Records in Data                0.0             23    7    0
            2.1. Ensure Data File Loads as Expected                 0.5              0    0   30
            4.2. Verify Data Split Proportion                       0.5              3   25    2
            3.2. Data in the Expected Format                        1.0              0   14   16
            5.3. Ensure Model Output Shape Aligns with Expectation  1.0              1   25    4
            6.1. Verify Evaluation Metrics Implementation           1.0              2   18   10
            6.2. Evaluate Model's Performance Against Thresholds    1.0              0   24    6
+
-

Contingency table of the satisfaction determined by our system versus the ground truth

+

Contingency table of our system’s satisfaction determination versus the ground truth

-

The accuracy issue may be attributed to the need for improvement of prompts in our checklist.

- -
-

Consistency

-

Since the completeness score from LLMs contain randomness, we further studied the consistency of scores across checklist items and reposities.

+

The accuracy issue may be attributed to a need to improve our checklist prompts.

+
+Consistency
+

As the completeness scores from LLMs contain randomness, we examined the consistency of completeness scores across checklist items and repositories.

Code -
# FIXME: jitter-boxplot, checklist item vs. SD
+
stds = df_repo__stat[['repo', 'std', 'id_title']].pivot(index='repo', columns='id_title').copy()
+stds.columns = [col[1] for col in stds.columns]
+stds = stds.reset_index()
+stds = stds.melt(id_vars='repo', var_name='id_title')
+
+base = alt.Chart(stds)
+
+box = base.mark_boxplot(
+    color='grey',
+    opacity=0.5,
+    size=20,
+).encode(
+    x=alt.X('value:Q').title('Standard Deviation of Scores'),
+    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))
+)
+
+stripplot = base.mark_circle(size=100).encode(
+    y=alt.Y( 
+        'id_title:N',
+        axis=alt.Axis(ticks=False, grid=True, labels=True), 
+        scale=alt.Scale(), 
+    ), 
+    x='value:Q',
+    yOffset="jitter:Q",
+    color=alt.Color('id_title:N', legend=None),
+    tooltip='repo'
+).transform_calculate(
+    # Generate Gaussian jitter with a Box-Muller transform
+    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
+)
+
+(
+    box + stripplot
+).configure_view( 
+    stroke=None
+).configure_axis( 
+    labelFontSize=12, 
+    titleFontSize=12
+).properties(
+    height=300, 
+    width=600,
+    title="30 Runs on Openja's Repositories for each Checklist Item"
+) 
+
-

Caption: Standard deviations of the score for each checklist item. Each dot represents the standard deviation of scores of 30 runs of a sigle repository

+

Standard deviations of the score for each checklist item. Each dot represents the standard deviation of scores from 30 runs of a single repository.

-

We found 2 diverging cases. For example, it shows high standard deviations across repositories for item 3.2 Data in the Expected Format. This might be a proof of poor prompt quality, making it ambiguous for the LLM and hence hard to produce consistent results. Prompt engineering might solve this problem.

-

On the other hand, there are outliers yielding exceptionally high standard deviations for item 5.3 Ensure Model Output Shape Aligns with Expectation. This may be because those repositories are unorthodox, and careful manual examination is required to achieve a more robust conclusion.

-
+

We identified two diverging cases:

+
+  1. High Standard Deviations
+

Items like 3.2 Data in the Expected Format showed high standard deviations across repositories. This might indicate that the prompt is not clear enough for the LLM to produce consistent results. Improved prompt engineering could address this issue.

+
+  2. Outliers with High Standard Deviations
+

Items like 5.3 Ensure Model Output Shape Aligns with Expectation had outliers with exceptionally high standard deviations, which is possibly due to unorthodox repositories. A careful manual examination is required for a more definitive conclusion.

Comparison of gpt-3.5-turbo and gpt-4o

-

To examine if newer LLMs help in both metrics, we preliminarily compared system outputs from gpt-4o and gpt-3.5-turbo on the lightfm repository, we observed that the gpt-4o system consistently returned “Satisfied”, which deviates from the ground truth.

+

To evaluate if newer LLMs improve performance, we preliminarily compared outputs from gpt-4o and gpt-3.5-turbo on the lightfm repository. We observed that gpt-4o consistently returned “Satisfied,” which deviated from the ground truth.

Code -
# FIXME: jitter-mean-sd plot (checklist item vs. score) for each repo
+
# FIXME: jitter-mean-sd plot (checklist item vs. score) for each repo
+df_repo_4o__stat = pd.read_csv('score_stat_by_repo_4o.csv')
+df_repo_4o__stat_with_gt = df_repo_4o__stat.merge(gt, on=['id', 'title', 'repo'])
+df_repo_4o__stat_with_gt['model'] = 'gpt-4o'
+
+df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query("repo == 'lightfm'").copy()
+df_repo_35turbo__stat_with_gt['model'] = 'gpt-3.5-turbo'
+
+df_model_comp = pd.concat(
+    (df_repo_35turbo__stat_with_gt, df_repo_4o__stat_with_gt), 
+    axis=0
+)
+
+base = alt.Chart(
+    df_model_comp
+).transform_calculate(
+    min="max(0, datum.mean-datum.std)",
+    max="min(1, datum.mean+datum.std)"
+)
+    
+# generate the points
+points = base.mark_point(
+    filled=True,
+    size=50,
+    color='black'
+).encode(
+    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Score").axis(
+        labelExpr="datum.value % 0.5 ? null : datum.label"
+    ),
+    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),
+)
+
+# generate the points for ground truth
+gt_points = base.mark_point(
+    filled=True,
+    size=200,
+    color='green',
+    shape="diamond"
+).encode(
+    x=alt.X('ground_truth:Q'),
+    y=alt.Y('id_title:N')
+)
+
+# generate the error bars
+errorbars = base.mark_errorbar().encode(
+    x=alt.X("min:Q").title('1 SD'), #"id:N",
+    x2="max:Q",
+    y="id_title:N"
+)
+
+(gt_points + points + errorbars).facet(
+    column=alt.Column('model:N').title(None)
+).configure_axis( 
+    labelFontSize=12, 
+    titleFontSize=12
+)
+
-

Caption: Comparison of the satisfaction using gpt-4o versus using gpt-3.5-turbo for each checklist item on lightfm

+

Comparison of satisfaction using gpt-4o versus gpt-3.5-turbo for each checklist item on lightfm

-

Further investigation into gpt-4o is required to address this issue and enhance the system performance.

+

Further investigation into gpt-4o is required to determine its effectiveness in system performance.

@@ -437,40 +1040,82 @@


Conclusion

Wrap Up

-

Our project, FixML, represents a significant step forward in the field of machine learning (ML) testing by providing curated checklists and automated tools that enhance the evaluation and creation of test suites for ML models. The development and implementation of FixML have been driven by both the need of better quality assurance in ML systems, and the current limitations of traditional testing methods on ML projects which are either too general without comprehensive clarification, or are too human-reliant.

-

FixML seamlessly takes in the user’s ML codebase, identifies and extracted its existing test suites. Together with the curated checklist on ML testing, FixML leverages Large Language Models (LLMs) to assess the completeness of the test suites and output detailed evaluation reports with completeness scores and specific reasons. This assists users in understanding the performance of their current test suites with insights. Additionally, FixML can generate test function specifications corresponding to the curated checklist, helping users utilizing their test suites.

-

In return, FixML solution combines the scalability of automated testing with the reliability of expert evaluation. By automating the evaluation process, FixML significantly reduces the time and human effort required to assess the quality of ML test suites. This popularizes thorough and efficient quality assessment on ML projects.

+

The development of FixML has been driven by the need for better quality assurance in ML systems and by the current limitations of traditional testing methods for ML projects. FixML provides curated checklists and automated tools that enhance the evaluation and creation of test suites for ML projects. This, in turn, significantly reduces the time and effort required to assess the completeness of ML test suites, and thus promotes thorough and efficient assessment of ML projects.

Limitation & Future Improvement

-

While FixML provides substantial benefits, there are limitations and areas that aim to be addressed in future development:

+

While FixML provides substantial benefits, there are limitations and areas to be addressed in future development:

  1. Specialized Checklist
-

The current checklist is designed to be general and may not cover all specific requirements for different ML projects. Future development will focus on creating more specialized checklists for different domains and project types, allowing for more tailored evaluations. Since the format of the checklist is designed to allow users to easily expand, edit and select checklist items based on their specific use case, we welcome any collaboration with ML researchers on the creation of specalized checklists.

+

The default checklist is general and may not cover all requirements for different ML projects. Future development will focus on creating specialized checklists for tailored evaluations across various domains and project types. Collaboration with ML researchers is welcomed for creating specialized checklists based on specific use cases.

  2. Enhanced Test Evaluator
-

Our current study unveils the varying accuracy and consistency issues on the evaluation results using OpenAI GPT models. Future improvements involves prompt enhancement with prompt engineering techniques and support for multiple LLMs for higher performance and flexibility of FixML test evaluator functionality. We also expect to deliver user guidelines in editing the prompts in our system, where ML developers can customize prompts for better performance and collaborate with us to embed them into the system.

+

Our study reveals accuracy and consistency issues in the evaluation results using OpenAI's GPT-3.5-turbo model. Future improvements involve better prompt engineering techniques and support for multiple LLMs for enhanced performance and flexibility. User guidelines on prompt creation will be provided to facilitate collaboration with ML developers.

  3. Customized Test Specification
-

FixML test specification generator currently produces general test function skeletons solely based on the curated checklist without the context of the specific ML projects. Future developments will involve the integration of the ML project codebase in the generation process to output customized test functions skeletons. This further lower the barrier of ML users in creating comprehensive test suites relevant to the projects.

+

Future developments will integrate project-specific information to produce customized test function skeletons. This may further encourage users to create comprehensive tests.

  4. Workflow Optimization #FIXME: have to review whether to include as it seems lower priority.
-

The current test evaluator and test specification generator are separate entities. This could be improved by embedding a workflow engine that allows the system to automatically take actions based on the LLM response. For instance, if the LLM response suggests that test suites are partially satisfied or non-satisfied, the system could automatically run the test generator to produce test function skeletons and then reevaluate them until they are satisfied or some threshold is met. This would create a more cohesive and efficient workflow, reducing manual intervention and improving overall system performance.

+

The test evaluator and test specification generator are currently separate. Future improvements could embed a workflow engine that automatically takes actions based on LLM responses. This would create a more cohesive and efficient workflow, reduce manual intervention, and improve overall system performance.

  5. Performance Optimization #FIXME: have to review whether to include as it seems lower priority.
-

Performance optimization is another critical area for future development. As FixML handles large codebases and complex evaluations, optimizing the system to handle these tasks more efficiently is essential. This includes improving the speed and accuracy of the LLM responses, reducing the time taken to analyze and generate reports, and ensuring the system can scale effectively to handle more extensive and more complex projects.

-

By addressing these limitations and focusing on these future improvements, FixML will become an even more powerful tool for ensuring the quality and robustness of machine learning and data science projects.

+

As FixML handles large codebases and complex evaluations, performance optimization is essential. Future developments will focus on improving the speed and accuracy of LLM responses, reducing analysis and report generation times, and ensuring scalability for handling larger and more complex projects.

+

By addressing these limitations and implementing future improvements, we aim for FixML to achieve better performance and contribute to the development of better ML systems, and ultimately enhance human life.

+

References

+
+Alexander, Rohan, Lindsay Katz, Callandra Moore, and Zane Schwartz. 2023. “Evaluating the Decency and Consistency of Data Validation Tests Generated by LLMs.” arXiv Preprint arXiv:2310.01402. +
+
+Belanger, Ashley. 2024. “Air Canada Must Honor Refund Policy Invented by Airline’s Chatbot.” Ars Technica. https://arstechnica.com/tech-policy/2024/02/air-canada-must-honor-refund-policy-invented-by-airlines-chatbot/. +
+
+Gawande, Atul. 2010. Checklist Manifesto, the (HB). Penguin Books India. +
+
+Grand-View-Research. 2021. “Artificial Intelligence Market Size, Share & Trends Analysis Report by Solution, by Technology (Deep Learning, Machine Learning), by End-Use, by Region, and Segment Forecasts, 2023 2030.” Grand View Research San Francisco. +
+
+Jordan, Jeremy. 2020. “Effective Testing for Machine Learning Systems.” https://www.jeremyjordan.me/testing-ml/. +
+
+Kapoor, Sayash, and Arvind Narayanan. 2022. “Leakage and the Reproducibility Crisis in ML-Based Science.” arXiv Preprint arXiv:2207.07048. +
+
+Nunwick, Alice. 2023. “ITutorGroup Settles AI Hiring Lawsuit Alleging Age Discrimination.” Verdict. https://www.verdict.co.uk/itutorgroup-settles-ai-hiring-lawsuit-alleging-age-discrimination/. +
+
+Openja, Moses, Foutse Khomh, Armstrong Foundjem, Zhen Ming, Mouna Abidi, Ahmed E Hassan, et al. 2023. “Studying the Practices of Testing Machine Learning Software in the Wild.” arXiv Preprint arXiv:2312.12604. +
+
+Pineau, Joelle, Philippe Vincent-Lamarre, Koustuv Sinha, Vincent Larivière, Alina Beygelzimer, Florence d’Alché-Buc, Emily Fox, and Hugo Larochelle. 2021. “Improving Reproducibility in Machine Learning Research (a Report from the Neurips 2019 Reproducibility Program).” Journal of Machine Learning Research 22 (164): 1–20. +
+
+Regidi, Asheeta. 2019. “SEBI’s Circular: The Black Box Conundrum and Misrepresentation in AI-Based Mutual Funds.” Firstpost. https://www.firstpost.com/business/sebis-circular-the-black-box-conundrum-and-misrepresentation-in-ai-based-mutual-funds-6625161.html. +
+
+Shepardson, David. 2023. “GM’s Cruise Recalling 950 Driverless Cars After Pedestrian Dragged in Crash.” Reuters. https://www.reuters.com/business/autos-transportation/gms-cruise-recall-950-driverless-cars-after-accident-involving-pedestrian-2023-11-08/. +
+
+Team, Microsoft Industry Solutions Engineering. 2023. “Testing Data Science and MLOps Code.” Testing Data Science and MLOps Code - Engineering Fundamentals Playbook. https://microsoft.github.io/code-with-engineering-playbook/machine-learning/ml-testing/. +
+
+Zhang, Yue, Yafu Li, Leyang Cui, Deng Cai, Lemao Liu, Tingchen Fu, Xinting Huang, et al. 2023. “Siren’s Song in the AI Ocean: A Survey on Hallucination in Large Language Models.” https://arxiv.org/abs/2309.01219. +
+
@@ -73,7 +92,7 @@ - + - - - - - - - -
- - -
- - - -
- - - -
-
#!pip install scipy altair
-
-
-
import os
-import json
-import yaml
-import pandas as pd
-from collections import Counter
-
-id_item_map = {
-    '2.1': 'Ensure Data File Loads as Expected',
-    '3.2': 'Data in the Expected Format',
-    '3.5': 'Check for Duplicate Records in Data',
-    '4.2': 'Verify Data Split Proportion',
-    '5.3': 'Ensure Model Output Shape Aligns with Expectation',
-    '6.1': 'Verify Evaluation Metrics Implementation',
-    '6.2': "Evaluate Model's Performance Against Thresholds"
-}
-
-ground_truth = [
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-]
-
-def get_score_report_from_response(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        response = json.load(file)
-    
-    reports = [] # report for each test file
-    for result in response['call_results']: # one test file per response
-        if result['parsed_response']:
-            resp = result['parsed_response']['results']
-            for item in resp:
-                item['file'] = result['files_evaluated'][0] 
-                item['success'] = result['success']
-                reports.append(item)
-        # FIXME: not handled failed run for now
-        # else: # if the run is failed, the parsed_response will be None
-        #     reports.append({
-        #         'ID': '2.1', 
-        #         'Title': '',
-        #         'Requirement': '',
-        #         'Observation': '',
-        #         'Functions': [],
-        #         'Evaluation': '',
-        #         'Score': 0,
-        #         'file': result['files_evaluated'][0],
-        #         'success': result['success']
-        #     })
-    
-    reports_df = pd.DataFrame(reports)
-    df = (
-        reports_df
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = reports_df.groupby(['file'])['success'].all()
-    df['response_path'] = os.path.abspath(resp_path)
-    
-    return df.reset_index()
-
-def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):
-    ''' Get score for each checklist item, by repository, by run and by test file
-    '''
-    with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:
-        config = pd.DataFrame(yaml.safe_load(file))
-
-    config['response_path'] = config['response_path'].apply(
-        lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))
-    )
-    
-    tmp = [
-        get_score_report_from_response(
-            os.path.join(batch_run_dir_path, path),
-            verbose=verbose
-        ) for path in config['response_path']
-    ]
-    tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-    
-    return config.merge(tmp, on='response_path', how='left')
-
-def preprocess(df_repo_run_file, id_item_map=None):
-    if id_item_map is None:
-        id_item_map = {
-            '2.1': 'Ensure Data File Loads as Expected',
-            '3.2': 'Data in the Expected Format',
-            '3.5': 'Check for Duplicate Records in Data',
-            '4.2': 'Verify Data Split Proportion',
-            '5.3': 'Ensure Model Output Shape Aligns with Expectation',
-            '6.1': 'Verify Evaluation Metrics Implementation',
-            '6.2': "Evaluate Model's Performance Against Thresholds"
-        }
-
-    # prepare score data by repo, by run
-    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-        id: ['max'] for id in id_item_map.keys()
-    })
-    df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-    df_repo_run = df_repo_run.reset_index()
-    
-    # prepare statistics of scores by repo
-    df_repo__stat = df_repo_run.groupby(['repo']).agg({
-        id: ['mean', 'std', 'count'] for id in id_item_map.keys()
-    })
-    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-    df_repo__stat = (
-        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-        .reset_index()
-        .rename_axis(None, axis=1)
-    )
-    df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])
-    df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']
-    
-    # prepare counting of scores by repo
-    df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-    for id in list(id_item_map.keys())[1:]:
-        df_repo__count = df_repo__count.merge(
-            df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-            on=['repo', 'level_1'],
-            how='outer'
-        )
-    #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])
-    
-    df_repo__count = df_repo__count.fillna(0)
-
-    df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')
-    df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])
-    df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']
-    
-    return (df_repo_run, df_repo__stat, df_repo__count)
-
-
-
df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/processed/batch_run_3.5-turbo/')
-df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)
-
-
-
df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)
-df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)
-df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)
-
-
-
df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/processed/batch_run_4o/')
-df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)
-
-df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)
-df_repo__stat.to_csv('score_stat_by_repo_4o.csv', index=False)
-df_repo__count.to_csv('score_count_by_repo_4o.csv', index=False)
-
-
-
ground_truth_df = pd.DataFrame(ground_truth)
-ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])
-ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')
-ground_truth_df.to_csv('ground_truth.csv')
-
-
-
#df_repo__count.melt(id_vars=['repo', 'level_1'])
-
-
-
-
- - - -
- -
- - - - \ No newline at end of file diff --git a/report/final_report/docs/02_finding-report.html b/report/final_report/docs/02_finding-report.html deleted file mode 100644 index 3102504..0000000 --- a/report/final_report/docs/02_finding-report.html +++ /dev/null @@ -1,2241 +0,0 @@ - - - - - - - - - -NOTE: the result is based on the code base abb9a21, which is similar to the commit 69d61a9 in the main branch - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
- - -
- - - -
- -
-
-

NOTE: the result is based on the code base abb9a21, which is similar to the commit 69d61a9 in the main branch

-
- - - -
- - - - -
- - -
- -
-
import scipy
-import pickle
-import yaml
-import pandas as pd
-import altair as alt
-from collections import Counter
-
-
-
def get_report(response):
-    report = []
-    for result in response.call_results:
-        if result.parsed_response:
-            resp = result.parsed_response['results']
-            for item in resp:
-                item['file'] = result.files_evaluated[0] 
-                item['success'] = result.success
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result.files_evaluated[0],
-                'success': result.success
-            })
-    return pd.DataFrame(report)
-
-def extract_file_and_scores(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        response = pickle.load(file)
-    report = get_report(response)
-    df = (
-        report
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = report.groupby(['file'])['success'].all()
-    df['response_path'] = resp_path
-    return df.reset_index()
-
-def generate_stat_plot(df_repo__stat, ground_truth=None, facet_col='repo', repo=None, id=None):
-    """
-    Generate Stat plot across all repo and all checklist item
-    Optional to incorporate ground truth and select specific repo/checklist item
-    """
-    if facet_col == 'repo':
-        x_col = 'id'
-        x_title = 'Checklist ID'
-    elif facet_col == 'id':
-        x_col = 'repo'
-        x_title = 'Repository'
-    
-    # the base chart
-    if repo:
-        df_repo__stat = df_repo__stat.query(f'repo == "{repo}"')
-    if id:
-        df_repo__stat = df_repo__stat.query(f'id == "{id}"')
-    
-    base = alt.Chart().transform_calculate(
-        min="max(0, datum.mean-datum.std)",
-        max="min(1, datum.mean+datum.std)"
-    )
-    
-    # generate the points
-    points = base.mark_point(
-        filled=True,
-        size=50,
-        color='black'
-    ).encode(
-        x=alt.X(f'{x_col}:O').axis(labelAngle=0).title(x_title),
-        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
-    )
-    
-    # generate the error bars
-    errorbars = base.mark_errorbar().encode(
-        x=f"{x_col}:O",
-        y=alt.Y("min:Q").title('1 SD'),
-        y2="max:Q"
-    )
-
-    plot = points + errorbars
-    
-    if ground_truth is not None:
-        # generate points of ground truth
-        if repo:
-            ground_truth = ground_truth.query(f'repo == "{repo}"')
-        if id:
-            ground_truth = ground_truth.query(f'id == "{id}"')
-        
-        df_repo__stat = pd.merge(df_repo__stat, ground_truth, how='left', on=['repo', 'id'])
-        
-        gt_points = alt.Chart().mark_point(
-            filled=True,
-            size=100,
-            color='green',
-            shape="diamond"
-        ).encode(
-            x=alt.X(f'{x_col}:O'),
-            y=alt.Y('score:Q')
-        )
-
-        plot += gt_points
-
-    plot = alt.layer(
-                plot,
-                data=df_repo__stat
-            ).properties(
-                width=400,
-            ).facet(
-                column=f'{facet_col}',
-                columns=2
-            )
-
-    return plot
-
-
-

preprocess data

-
-
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']
-
-#result_path = '../draft/batch_run_results/record_combine.yml'
-result_path = '../data/processed/batch_run/record_combine.yml'
-with open(result_path, 'r') as file:
-    config = pd.DataFrame(yaml.safe_load(file))
-
-# prepare score data by repo, run, file
-tmp = [
-    extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
-]
-tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-
-raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')
-
-
-
# filter non-test files in qlib
-df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')
-
-# prepare score data by repo, run
-df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-    id: ['max'] for id in checklist_ids
-})
-df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-df_repo_run = df_repo_run.reset_index()
-
-# prepare statistics of scores by repo
-df_repo__stat = df_repo_run.groupby(['repo']).agg({
-    id: ['mean', 'std', 'count'] for id in checklist_ids
-})
-df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-df_repo__stat = (
-    df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-    .reset_index()
-    .rename_axis(None, axis=1)
-)
-
-# prepare counting of scores by repo
-df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-for id in checklist_ids[1:]:
-    df_repo__count = df_repo__count.merge(
-        df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-        on=['repo', 'level_1'],
-        how='outer'
-    )
-
-df_repo__count = df_repo__count.fillna(0)
-
-
-
-

Runs Quality

-
-

1. Some non-test files are included in the evaluation

-

For example, ./nanodet/nanodet/trainer/task.py:

-
-
raw_df_repo_run_file.query('repo == "nanodet"')['file'].unique()[:3]
-
-
array(['../data/raw/openja/nanodet/nanodet/trainer/task.py',
-       '../data/raw/openja/nanodet/tests/test_configs/test_config.py',
-       '../data/raw/openja/nanodet/tests/test_data/test_batch_process.py'],
-      dtype=object)
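One possible mitigation, sketched below on the assumption that test files follow the usual pytest naming conventions (test_*.py or *_test.py; note this differs from the path-based qlib filter used above), is to screen candidate files by name before evaluation:

import fnmatch
import os

def looks_like_test_file(path):
    """Heuristic: keep only files named test_*.py or *_test.py."""
    name = os.path.basename(path)
    return fnmatch.fnmatch(name, 'test_*.py') or fnmatch.fnmatch(name, '*_test.py')

# files that this heuristic would exclude, e.g. nanodet/trainer/task.py
raw_df_repo_run_file[~raw_df_repo_run_file['file'].apply(looks_like_test_file)]['file'].unique()[:3]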
-
-
-
-
-

2. Evaluation on the file magenta/magenta/models/music_vae/data_test.py always fails

-
-
df_repo_run_file[~df_repo_run_file.success]['file'].unique()
-
-
array(['../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
-       '../data/raw/openja/paperless-ng/src/documents/tests/test_api.py'],
-      dtype=object)
-
-
-
-
-

3. DeepSpeech, lightfm and magenta have the fewest (Python) test files

-
-
df_repo_run_file.query('run == 1').groupby(['repo'])['file'].count().reset_index()
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo          file
0  DeepSpeech     3
1  apollo        14
2  lightfm        7
3  magenta        8
4  mmf           70
5  mycroft-core  64
6  nanodet       42
7  paperless-ng  35
8  qlib          31
- -
-
-
-
-
-

4. The test files are not always in a tests/ folder. Is it good practice to always do that? Should one of the checklist items ensure that all tests are placed under a tests/ folder? (A possible check is sketched after the example below.)

-

For example, magenta

-
-
df_repo_run_file.query('repo == "magenta"')['file'].unique()
-
-
array(['../data/raw/openja/magenta/conftest.py',
-       '../data/raw/openja/magenta/magenta/common/state_util_test.py',
-       '../data/raw/openja/magenta/magenta/models/coconet/export_saved_model_test.py',
-       '../data/raw/openja/magenta/magenta/models/coconet/lib_data.py',
-       '../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
-       '../data/raw/openja/magenta/magenta/models/onsets_frames_transcription/create_dataset_lib_test.py',
-       '../data/raw/openja/magenta/magenta/models/score2perf/datagen_beam_test.py',
-       '../data/raw/openja/magenta/magenta/pipelines/pipeline_test.py'],
-      dtype=object)
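If such a convention were adopted as a checklist item, a simple check (a sketch, not an existing FixML feature) could flag test files that do not live under any tests/ directory:

def outside_tests_folder(paths):
    """Return the test files that are not located under a 'tests' directory."""
    return [p for p in paths if 'tests' not in p.split('/')]

# for magenta this flags, for example, magenta/common/state_util_test.py
outside_tests_folder(df_repo_run_file.query('repo == "magenta"')['file'].unique())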
-
-
-
-
-
-

Findings on 8 repos

-
-
df_repo_run_file.repo.unique()
-
-
array(['lightfm', 'qlib', 'mmf', 'nanodet', 'magenta', 'DeepSpeech',
-       'paperless-ng', 'mycroft-core', 'apollo'], dtype=object)
-
-
-
-

1. Overview of accuracy and consistency lightfm evaluation

-

Let the ground truth of lightfm be as follows:

-
-
# Ground truth
-ground_truth = pd.DataFrame([
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-])
-ground_truth[ground_truth.repo == 'lightfm']
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo     id   score
0  lightfm  2.1  1.0
1  lightfm  3.2  1.0
2  lightfm  3.5  0.0
3  lightfm  4.2  1.0
4  lightfm  5.3  0.5
5  lightfm  6.1  1.0
6  lightfm  6.2  1.0
- -
-
-
-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="lightfm", facet_col='repo')
-
- - -
- -
-
-

The distribution of the scores for each checklist item:

-
-
df_repo__count.query('repo == "lightfm"')
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo     level_1  2.1   3.2   3.5   4.2   5.3   6.1   6.2
6  lightfm  0.0       0.0   1.0  19.0   0.0  18.0   0.0   0.0
7  lightfm  0.5       1.0  29.0   6.0  27.0  12.0  20.0   4.0
8  lightfm  1.0      29.0   0.0   5.0   3.0   0.0  10.0  26.0
- -
-
-
-

Observations: The system evaluation broadly aligns with our manual evaluation:
  • for items that we consider “Satisfied” (score = 1), the system mostly outputs 0.5 or 1
  • for items that we consider “Partially Satisfied” or “Not Satisfied”, the system mostly outputs 0.5 or 0
  • some checklist items display high variance, e.g. 3.5, 5.3 and 6.1

-
-
-

2. Overview of qlib

-

Let the ground truth of qlib be as follows (FIXME: to be confirmed):

-
-
# Ground truth
-ground_truth[ground_truth.repo == 'qlib']
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    repo  id   score
7   qlib  2.1  0.5
8   qlib  3.2  1.0
9   qlib  3.5  0.0
10  qlib  4.2  0.5
11  qlib  5.3  1.0
12  qlib  6.1  1.0
13  qlib  6.2  1.0
- -
-
-
-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="qlib", facet_col='repo')
-
- - -
- -
-
-
-
df_repo__count.query('repo == "qlib"')
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    repo  level_1  2.1   3.2   3.5   4.2   5.3   6.1   6.2
24  qlib  0.0       0.0   1.0  29.0   3.0  14.0   4.0   1.0
25  qlib  0.5       0.0  12.0   1.0  27.0  16.0  24.0  26.0
26  qlib  1.0      30.0  17.0   0.0   0.0   0.0   2.0   3.0
- -
-
-
-

Observations:
  • There is more disagreement between the system and manual evaluation, especially for 5.3, 6.1 and 6.2.
  • The per-item consistency in this repo differs from lightfm; e.g. the variance for 3.5 is greatly reduced, while the variance for 3.2 becomes larger.
  • However, qlib is not just a machine learning project; it also contains a software system. For example, it generates a lot of data by itself instead of reading a data file to perform analysis, which seems to deviate from the objective of 2.1.

-
-
-

3. The consistency of each checklist item

-
    -
  • Why is it important? If the score of a particular item varies a lot when evaluating a repository, it might mean that its prompt (Requirement) is confusing to the LLM, or the checklist item itself is not well defined.
  • -
-
-
df_repo__stat.pivot(index='id', columns='repo', values='std')
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
repoDeepSpeechapollolightfmmagentammfmycroft-corenanodetpaperless-ngqlib
id
2.10.4794630.1525640.0912870.4234510.0000000.0000000.0000000.2421170.000000
3.20.4068380.2150920.0912870.1895250.2450660.2780270.2397320.0912870.285673
3.50.0000000.0000000.3880400.2520030.1268540.0000000.2520030.0000000.091287
4.20.0000000.0000000.1525640.0912870.1268540.0000000.2542740.0000000.152564
5.30.0000000.0000000.2491360.0000000.1268540.0000000.0000000.0000000.253708
6.10.3511070.1728730.2397320.2520030.2330460.0000000.2856730.0000000.224888
6.20.0000000.0000000.1728730.0000000.2012890.2537080.2604150.1268540.182574
- -
-
-
-
-
alt.Chart(df_repo__stat).mark_boxplot().encode(
-    x="std:Q",
-    y='id:N'
-).properties(
-    height=200,
-    width=400
-)
-
- - -
- -
-
-

Observations:
  • The evaluation of checklist item 2.1 Ensure Data File Loads as Expected is usually stable.
  • When evaluating a repository, 50% of the time its standard deviation is smaller than 0.05, the smallest among the items.

-

Below is the breakdown of item scores for each repository:
-(NOTE: only lightfm and qlib have ground truth, shown as green diamonds)

-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, facet_col='id')
-
- - -
- -
-
-

Observations:
  • (TBC) The standard deviations for items 3.5 and 5.3 vary greatly across repositories, which might imply that the test cases in some repositories are confusing to the LLM while others are clear.
  • (TBC) The standard deviations for items 5.3, 6.1 and 6.2 are relatively high and consistent, which might imply there is room to refine the prompts to reduce the consistency issue.

-
-
-

4. The consistency of each checklist item, compared to lightfm

-
    -
  • Why is it important? We optimized the consistency of our system using lightfm. Therefore, we treat this repository as a benchmark. If a particular checklist item has much worse consistency in another repository, that might mean that the prompt for that item is not generalizable (a quantitative check is sketched below).
  • -
-
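One way to make “much worse consistency” precise is a variance-ratio comparison against the lightfm benchmark, as sketched below (the scores are bounded and discrete, so the normality assumption behind the F distribution is rough at best; the ratio plot further below draws a related F-based threshold line):

import scipy.stats

def worse_than_benchmark(std_other, std_lightfm, n_runs=30, alpha=0.05):
    """Variance-ratio check: under normality, s1^2 / s2^2 follows F(n-1, n-1)."""
    ratio = (std_other / std_lightfm) ** 2
    critical = scipy.stats.f.ppf(1 - alpha / 2, n_runs - 1, n_runs - 1)
    return ratio > critical, ratio, critical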

Below are the standard deviations over 30 runs for each checklist item and repository:

-
-
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
-stds
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
std
id2.13.23.54.25.36.16.2
repo
DeepSpeech0.4794630.4068380.0000000.0000000.0000000.3511070.000000
apollo0.1525640.2150920.0000000.0000000.0000000.1728730.000000
lightfm0.0912870.0912870.3880400.1525640.2491360.2397320.172873
magenta0.4234510.1895250.2520030.0912870.0000000.2520030.000000
mmf0.0000000.2450660.1268540.1268540.1268540.2330460.201289
mycroft-core0.0000000.2780270.0000000.0000000.0000000.0000000.253708
nanodet0.0000000.2397320.2520030.2542740.0000000.2856730.260415
paperless-ng0.2421170.0912870.0000000.0000000.0000000.0000000.126854
qlib0.0000000.2856730.0912870.1525640.2537080.2248880.182574
- -
-
-
-
-
stds_p = stds.copy()
-stds_p.columns = [col[1] for col in stds_p.columns]
-stds_p = stds_p.reset_index()
-stds_p = stds_p.melt(id_vars='repo', var_name='id')
-
-
-
stds_p.head()
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo        id   value
0  DeepSpeech  2.1  0.479463
1  apollo      2.1  0.152564
2  lightfm     2.1  0.091287
3  magenta     2.1  0.423451
4  mmf         2.1  0.000000
- -
-
-
-
-
# stripplot = (
-#     alt.Chart(stds_p)
-#     .mark_point(filled=True, size=100)
-#     .transform_calculate( 
-#         # Generate Gaussian jitter with a Box-Muller transform 
-#         jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
-#         # jitter='random()'
-#     ).encode( 
-#         y=alt.Y( 
-#             'jitter:Q', 
-#             title=None, 
-#             axis=alt.Axis(ticks=False, grid=True, labels=False), 
-#             scale=alt.Scale(), 
-#         ), 
-#         x=alt.X('value:Q'), 
-#         color=alt.Color('repo:N'),
-#         row=alt.Row( 
-#             'id:N',
-#             header=alt.Header(
-#                 labelFontSize=16,
-#                 labelAngle=0
-#             )
-#         ),
-#         tooltip='repo'
-#     ).configure_facet( 
-#         spacing=0
-#     ).configure_view( 
-#         stroke=None
-#     ).configure_axis( 
-#         labelFontSize=16, 
-#         titleFontSize=16
-#     ).properties(
-#         height=50, 
-#         width=600
-#     ) 
-# )
-    
-# stripplot 
-
-
-
def generate_jitterbox_plot(df_stds_p):
-    """
-    Generate jitterbox plot across all repo and all checklist item
-    """
-    box = alt.Chart().mark_boxplot(
-        color='grey',
-        opacity=0.5,
-        size=20,
-    ).encode(
-        x=alt.X('value:Q').title('SD(Score)'),
-        y=alt.Y('id:N', title=None, axis=alt.Axis(labelPadding=10, grid=False))
-    )
-    
-    stripplot = alt.Chart().mark_circle(size=100).encode(
-        y=alt.Y( 
-            'id:N',
-            axis=alt.Axis(ticks=False, grid=True, labels=True), 
-            scale=alt.Scale(), 
-        ), 
-        x='value:Q',
-        yOffset="jitter:Q",
-        color=alt.Color('id:N', legend=None),
-        tooltip='repo'
-    ).transform_calculate(
-        # Generate Gaussian jitter with a Box-Muller transform
-        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-    )
-    
-    plot = alt.layer(
-        box,
-        stripplot,
-        data=df_stds_p
-    ).configure_view( 
-        stroke=None
-    ).configure_axis( 
-        labelFontSize=16, 
-        titleFontSize=16
-    ).properties(
-        height=300, 
-        width=600
-    ) 
-    
-    return plot
-
-
-
generate_jitterbox_plot(stds_p)
-
- - -
- -
-
-
-
alt.Chart(df_repo__stat).mark_boxplot().encode(
-    x="std:Q",
-    y='id:N'
-).properties(
-    height=200,
-    width=400
-)
-
- - -
- -
-
-
-
# !pip install altair_catplot
-# !pip install seaborn
-
-
-
# import altair_catplot
-
-# altair_catplot.catplot(
-#     stds_p, 
-#     transform ='jitterbox', 
-#     mark ='point', 
-#     encoding = dict(
-#         x = alt.X('value:Q'), 
-#         y = alt.Y('id:N'), 
-#         color = alt.Color('repo:N')
-#     ) 
-# )
-
-
-
F = stds.drop(index='lightfm') / stds.loc['lightfm']
-
-base = alt.Chart(
-    F.melt(ignore_index=False).reset_index()[['repo', 'id', 'value']]
-).transform_calculate(
-    benchmark="1",
-    threshold=f"{scipy.stats.f.ppf(0.975, 29, 29)}"
-)
-
-point = base.mark_point(
-    filled=True,
-    size=100,
-).encode(
-    x=alt.X('value:Q').title("std ratio (c.f. lightfm)"),
-    y='id:N',
-    color='repo',
-    tooltip='repo'
-).properties(
-    height=200,
-    width=400
-)
-
-point \
-+ base.mark_rule(color='black').encode(x="benchmark:Q") \
-+ base.mark_rule(color='red').encode(x="threshold:Q")
-# jitter instead of mark_point <-- prompt vs. repo problem?
-# prompt: sd of checklist item for all repo is high
-# repo: most of repo have low sd, the repo we're looking at has outlier
-
- - -
- -
-
-

Observations:
  • The evaluation of checklist item 3.2 Data in the Expected Format becomes much more unstable in most of the other repositories.
  • That of 2.1 is significantly unstable in paperless-ng, magenta and DeepSpeech, but this may be due to the repositories themselves.

-

TODO: look into the scores for item 3.2.

-
-
-

TODO: Given ground truth == 1, distribution of system score?

-
-
-

TODO: Given ground truth == 0, distribution of system score?

-
-
def generate_histogram_plot(df_repo_run_long, df_ground_truth=None, repo=None, id=None):
-    """
-    Generate histogram across all repo and all checklist item
-    Optional to incorporate ground truth and select specific repo/checklist item
-    """
-    # data
-    repo_data = df_repo_run_long.copy()
-    if repo:
-        repo_data = repo_data.query(f'repo == "{repo}"')
-    if id:
-        repo_data = repo_data.query(f'id == "{id}"')
-
-    # base histogram chart
-    base = alt.Chart().mark_bar().encode(
-                x=alt.X('eval_score:Q', title='Score'), 
-                y=alt.Y('count()'), 
-                color=alt.value('grey'),
-                size=alt.value(20),
-            )
-    
-    if df_ground_truth is not None:
-        # data
-        gt_data = df_ground_truth.copy()
-        if repo:
-            gt_data = gt_data.query(f'repo == "{repo}"')
-        if id:
-            gt_data = gt_data.query(f'id == "{id}"')
-        
-        repo_data = pd.merge(repo_data, gt_data, how='left', on=['repo', 'id'])
-        repo_data['is_equal_to_gt'] = repo_data['eval_score'] == repo_data['score']
-        
-        # base histogram chart
-        base = base.encode(
-                    color=alt.Color('is_equal_to_gt', scale=alt.Scale(range=['grey', 'green']), legend=None)
-                )
-        base += base.mark_text().encode(
-            text=alt.value('Ground Truth'),
-            x='score',
-            size=alt.value(10),
-            color=alt.value('green'),
-        )
-
-    plot = alt.layer(
-                base,
-                data=repo_data
-            ).properties(
-                width=200,
-                height=200,
-            ).facet(
-                row='repo',
-                column='id'
-            )        
-    
-    return plot
-
-
-
-

Contingency Table

-
-
df_repo_run_p = pd.melt(df_repo_run, id_vars=['repo', 'run'], var_name='id', value_name='eval_score')
-df_repo_run_p = pd.merge(df_repo_run_p, ground_truth, how='inner', on=['repo', 'id'])
-df_repo_run_p = df_repo_run_p.rename(columns={'score': 'ground_truth'})
-pd.pivot_table(df_repo_run_p, values='run', index=['ground_truth'], columns=['eval_score'], aggfunc='count', fill_value=0)
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
eval_score0.00.51.0
ground_truth
0.0227835
0.5213930
1.02115990
- -
-
-
-
-
df_repo_run_p
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
reporunideval_scoreground_truth
0DeepSpeech12.11.00.0
1DeepSpeech22.11.00.0
2DeepSpeech32.10.00.0
3DeepSpeech42.11.00.0
4DeepSpeech52.10.00.0
..................
625qlib266.20.51.0
626qlib276.20.51.0
627qlib286.20.51.0
628qlib296.21.01.0
629qlib306.20.51.0
- -

630 rows × 5 columns

-
-
-
-
-
# generate_histogram_plot(df_repo_run_p, df_ground_truth=ground_truth)
-
- - -
-
- -
- -
- - - - \ No newline at end of file diff --git a/report/final_report/docs/02_plots-for-final-report.html b/report/final_report/docs/02_plots-for-final-report.html deleted file mode 100644 index 2ed9083..0000000 --- a/report/final_report/docs/02_plots-for-final-report.html +++ /dev/null @@ -1,931 +0,0 @@ - - - - - - - - - -plots-for-final-report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
- - -
- - - -
- - - -
-
import altair as alt
-import pandas as pd
-
-df_repo__stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')
-gt = pd.read_csv('ground_truth.csv')
-gt = gt.melt(id_vars=['id', 'title'], var_name='repo', value_name='ground_truth')
-
-df_repo__stat_with_gt = df_repo__stat.merge(gt, on=['id', 'title', 'repo'])
-
-base = alt.Chart(
-    df_repo__stat_with_gt.query('repo in ["lightfm", "qlib", "DeepSpeech"]')
-).transform_calculate(
-    min="max(0, datum.mean-datum.std)",
-    max="min(1, datum.mean+datum.std)"
-)
-    
-# generate the points
-points = base.mark_point(
-    filled=True,
-    size=50,
-    color='black'
-).encode(
-    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Score").axis(
-        labelExpr="datum.value % 0.5 ? null : datum.label"
-    ),
-    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),
-)
-
-# generate the points for ground truth
-gt_points = base.mark_point(
-    filled=True,
-    size=200,
-    color='green',
-    shape="diamond"
-).encode(
-    x=alt.X('ground_truth:Q'),
-    y=alt.Y('id_title:N')
-)
-
-# generate the error bars
-errorbars = base.mark_errorbar().encode(
-    x=alt.X("min:Q").title('1 SD'), #"id:N",
-    x2="max:Q",
-    y="id_title:N"
-)
-
-(gt_points + points + errorbars).facet(
-    column=alt.Column('repo:N').title(None)
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-)
-
- - -
- -
-
-
-
df_repo_run = pd.read_csv('score_by_repo_run_3.5-turbo.csv')
-
-df_repo_run = df_repo_run.merge(gt, on=['id', 'title', 'repo'])
-
-contingency_table = pd.pivot_table(
-    df_repo_run,
-    values='run', 
-    index=['repo', 'id_title', 'ground_truth'], 
-    columns=['score'],
-    aggfunc='count', 
-    fill_value=0
-)
-contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth']
-contingency_table.sort_index(level=[0, 2])
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
score0.00.51.0
RepositoryChecklist ItemGround Truth
lightfm3.5. Check for Duplicate Records in Data0.03000
5.3. Ensure Model Output Shape Aligns with Expectation0.51290
2.1. Ensure Data File Loads as Expected1.00030
3.2. Data in the Expected Format1.00300
4.2. Verify Data Split Proportion1.001119
6.1. Verify Evaluation Metrics Implementation1.00525
6.2. Evaluate Model's Performance Against Thresholds1.00129
qlib3.5. Check for Duplicate Records in Data0.02370
2.1. Ensure Data File Loads as Expected0.50030
4.2. Verify Data Split Proportion0.53252
3.2. Data in the Expected Format1.001416
5.3. Ensure Model Output Shape Aligns with Expectation1.01254
6.1. Verify Evaluation Metrics Implementation1.021810
6.2. Evaluate Model's Performance Against Thresholds1.00246
- -
-
-
-
-
stds = df_repo__stat[['repo', 'std', 'id_title']].pivot(index='repo', columns='id_title').copy()
-stds.columns = [col[1] for col in stds.columns]
-stds = stds.reset_index()
-stds = stds.melt(id_vars='repo', var_name='id_title')
-
-base = alt.Chart(stds)
-
-box = base.mark_boxplot(
-    color='grey',
-    opacity=0.5,
-    size=20,
-).encode(
-    x=alt.X('value:Q').title('Standard Deviation of Scores'),
-    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))
-)
-
-stripplot = base.mark_circle(size=100).encode(
-    y=alt.Y( 
-        'id_title:N',
-        axis=alt.Axis(ticks=False, grid=True, labels=True), 
-        scale=alt.Scale(), 
-    ), 
-    x='value:Q',
-    yOffset="jitter:Q",
-    color=alt.Color('id_title:N', legend=None),
-    tooltip='repo'
-).transform_calculate(
-    # Generate Gaussian jitter with a Box-Muller transform
-    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-)
-
-(
-    box + stripplot
-).configure_view( 
-    stroke=None
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=300, 
-    width=600,
-    title="30 Runs on Openja's Repositories for each Checklist Item"
-) 
-
- - -
- -
-
-
-
df_repo_4o__stat = pd.read_csv('score_stat_by_repo_4o.csv')
-df_repo_4o__stat_with_gt = df_repo_4o__stat.merge(gt, on=['id', 'title', 'repo'])
-df_repo_4o__stat_with_gt['model'] = 'gpt-4o'
-
-df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query("repo == 'lightfm'").copy()
-df_repo_35turbo__stat_with_gt['model'] = 'gpt-3.5-turbo'
-
-df_model_comp = pd.concat(
-    (df_repo_35turbo__stat_with_gt, df_repo_4o__stat_with_gt), 
-    axis=0
-)
-
-base = alt.Chart(
-    df_model_comp
-).transform_calculate(
-    min="max(0, datum.mean-datum.std)",
-    max="min(1, datum.mean+datum.std)"
-)
-    
-# generate the points
-points = base.mark_point(
-    filled=True,
-    size=50,
-    color='black'
-).encode(
-    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Score").axis(
-        labelExpr="datum.value % 0.5 ? null : datum.label"
-    ),
-    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),
-)
-
-# generate the points for ground truth
-gt_points = base.mark_point(
-    filled=True,
-    size=200,
-    color='green',
-    shape="diamond"
-).encode(
-    x=alt.X('ground_truth:Q'),
-    y=alt.Y('id_title:N')
-)
-
-# generate the error bars
-errorbars = base.mark_errorbar().encode(
-    x=alt.X("min:Q").title('1 SD'), #"id:N",
-    x2="max:Q",
-    y="id_title:N"
-)
-
-(gt_points + points + errorbars).facet(
-    column=alt.Column('model:N').title(None)
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-)
-
- - -
- -
-
- - - -
- -
- - - - \ No newline at end of file diff --git a/report/final_report/docs/04_plots-for-presentations.html b/report/final_report/docs/04_plots-for-presentations.html deleted file mode 100644 index eea2073..0000000 --- a/report/final_report/docs/04_plots-for-presentations.html +++ /dev/null @@ -1,1036 +0,0 @@ - - - - - - - - - -Accuracy: Contingency table - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
- - -
- - - -
- -
-
-

Accuracy: Contingency table

-
- - - -
- - - - -
- - -
- -
-
!pip install scipy altair
-
-
Requirement already satisfied: scipy in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (1.13.1)
-Collecting altair
-  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
-Requirement already satisfied: numpy<2.3,>=1.22.4 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from scipy) (1.26.4)
-Requirement already satisfied: jinja2 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (3.1.4)
-Requirement already satisfied: jsonschema>=3.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (4.22.0)
-Requirement already satisfied: packaging in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (23.2)
-Requirement already satisfied: pandas>=0.25 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (2.2.2)
-Collecting toolz (from altair)
-  Using cached toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
-Requirement already satisfied: attrs>=22.2.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (23.2.0)
-Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (2023.12.1)
-Requirement already satisfied: referencing>=0.28.4 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (0.35.1)
-Requirement already satisfied: rpds-py>=0.7.1 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (0.18.1)
-Requirement already satisfied: python-dateutil>=2.8.2 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2.9.0.post0)
-Requirement already satisfied: pytz>=2020.1 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2024.1)
-Requirement already satisfied: tzdata>=2022.7 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2024.1)
-Requirement already satisfied: MarkupSafe>=2.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jinja2->altair) (2.1.5)
-Requirement already satisfied: six>=1.5 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=0.25->altair) (1.16.0)
-Using cached altair-5.3.0-py3-none-any.whl (857 kB)
-Using cached toolz-0.12.1-py3-none-any.whl (56 kB)
-Installing collected packages: toolz, altair
-Successfully installed altair-5.3.0 toolz-0.12.1
-
-
-
-
import scipy
-import pickle
-import json
-import yaml
-import pandas as pd
-import altair as alt
-from collections import Counter
-
-
-
def get_report(response):
-    report = []
-    for result in response.call_results:
-        if result.parsed_response:
-            resp = result.parsed_response['results']
-            for item in resp:
-                item['file'] = result.files_evaluated[0] 
-                item['success'] = result.success
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result.files_evaluated[0],
-                'success': result.success
-            })
-    return pd.DataFrame(report)
-
-def get_report_json(response):
-    report = []
-    for result in response['call_results']:
-        if result['parsed_response']:
-            resp = result['parsed_response']['results']
-            for item in resp:
-                item['file'] = result['files_evaluated'][0] 
-                item['success'] = result['success']
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result['files_evaluated'][0],
-                'success': result['success']
-            })
-    return pd.DataFrame(report)
-
-def extract_file_and_scores(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        try:
-            response = pickle.load(file)
-            report = get_report(response)
-        except:
-            response = json.load(file)
-            report = get_report_json(response)
-    df = (
-        report
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = report.groupby(['file'])['success'].all()
-    df['response_path'] = resp_path
-    return df.reset_index()
-
-
-
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']
-
-def read_and_preprocess(result_path):
-    with open(result_path, 'r') as file:
-        config = pd.DataFrame(yaml.safe_load(file))
-    
-    # prepare score data by repo, run, file
-    tmp = [
-        extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
-    ]
-    tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-    
-    raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')
-
-    # filter non-test files in qlib
-    df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')
-    
-    # prepare score data by repo, run
-    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-        id: ['max'] for id in checklist_ids
-    })
-    df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-    df_repo_run = df_repo_run.reset_index()
-    
-    # prepare statistics of scores by repo
-    df_repo__stat = df_repo_run.groupby(['repo']).agg({
-        id: ['mean', 'std', 'count'] for id in checklist_ids
-    })
-    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-    df_repo__stat = (
-        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-        .reset_index()
-        .rename_axis(None, axis=1)
-    )
-    
-    # prepare counting of scores by repo
-    df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-    for id in checklist_ids[1:]:
-        df_repo__count = df_repo__count.merge(
-            df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-            on=['repo', 'level_1'],
-            how='outer'
-        )
-    
-    df_repo__count = df_repo__count.fillna(0)
-
-    return (df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count)
-
-
-
# Ground truth
-ground_truth = pd.DataFrame([
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-])
-
-
-
id_item_map = {
-    '2.1': 'Ensure Data File Loads as Expected',
-    '3.2': 'Data in the Expected Format',
-    '3.5': 'Check for Duplicate Records in Data',
-    '4.2': 'Verify Data Split Proportion',
-    '5.3': 'Ensure Model Output Shape Aligns with Expectation',
-    '6.1': 'Verify Evaluation Metrics Implementation',
-    '6.2': "Evaluate Model's Performance Against Thresholds"
-}
-
-
-
#result_path = '../draft/batch_run_results/record_combine.yml'
-df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count = read_and_preprocess(
-    '../data/processed/batch_run/record_combine.yml'
-)
-
-
-
cont_table = pd.melt(
-    df_repo_run.query('(repo == "lightfm")')[['repo', 'run', '3.5', '4.2', '5.3']], 
-    id_vars=['repo', 'run'], var_name='id', value_name='System Output')
-cont_table = pd.merge(cont_table, ground_truth, how='inner', on=['repo', 'id'])
-cont_table = cont_table.rename(columns={'score': 'ground_truth'})
-cont_table['title'] = cont_table['id'].apply(lambda x: id_item_map[x])
-#cont_table = cont_table[['repo', 'title', 'ground_truth', 'System Output', 'run']]
-cont_table = pd.pivot_table(cont_table, values='run', index=['repo', 'id', 'title', 'ground_truth'], columns=['System Output'], aggfunc='count', fill_value=0)
-cont_table.index.names = ['Repository', 'ID', 'Title', 'Ground Truth']
-cont_table.sort_index(level=3)
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
System Output0.00.51.0
RepositoryIDTitleGround Truth
lightfm3.5Check for Duplicate Records in Data0.01965
5.3Ensure Model Output Shape Aligns with Expectation0.518120
4.2Verify Data Split Proportion1.00273
- -
-
-
-
-

Consistency: jitterbox plot

-
-
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id').copy()
-stds.columns = [col[1] for col in stds.columns]
-stds = stds.reset_index()
-stds = stds.melt(id_vars='repo', var_name='id')
-stds['title'] = stds['id'].apply(lambda x: id_item_map[x])
-
-
-
box = alt.Chart().mark_boxplot(
-    color='grey',
-    opacity=0.5,
-    size=20,
-).encode(
-    x=alt.X('value:Q').title('System Output Uncertainty'),
-    y=alt.Y('title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))
-)
-
-stripplot = alt.Chart().mark_circle(size=100).encode(
-    y=alt.Y( 
-        'title:N',
-        axis=alt.Axis(ticks=False, grid=True, labels=True), 
-        scale=alt.Scale(), 
-    ), 
-    x='value:Q',
-    yOffset="jitter:Q",
-    color=alt.Color('id:N', legend=None),
-    tooltip='repo'
-).transform_calculate(
-    # Generate Gaussian jitter with a Box-Muller transform
-    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-)
-
-plot = alt.layer(
-    box,
-    stripplot,
-    data=stds
-).configure_view( 
-    stroke=None
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=300, 
-    width=600,
-    title="30 Runs on Openja's Repositories for each Checklist Item"
-) 
-
-
-
plot
-
- - -
- -
-
-
-
-

Improvement from gpt-3.5-turbo to gpt-4o

-
-
#result_path = '../draft/batch_run_results/record_combine.yml'
-df_repo_run_file_4o, df_repo_run_4o, df_repo_4o__stat, df_repo_4o__count = read_and_preprocess(
-    '../data/processed/batch_run_4o/record_combine.yml'
-)
-
-
-
df_repo_4o__stat
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo     id   count  mean  std
0  lightfm  2.1  30.0   1.0   0.0
1  lightfm  3.2  30.0   1.0   0.0
2  lightfm  3.5  30.0   1.0   0.0
3  lightfm  4.2  30.0   1.0   0.0
4  lightfm  5.3  30.0   1.0   0.0
5  lightfm  6.1  30.0   1.0   0.0
6  lightfm  6.2  30.0   1.0   0.0
- -
-
-
-
-
df1 = df_repo__stat.query('(repo == "lightfm") & (id == "4.2")').copy()
-df1['model'] = ['gpt-3.5-turbo']
-
-df2 = df_repo_4o__stat.query('(repo == "lightfm") & (id == "4.2")').copy()
-df2['model'] = ['gpt-4o']
-
-df_model_comp = pd.concat((df1, df2), axis=0)
-
-
-
base = alt.Chart(df_model_comp).transform_calculate(
-    min="max(0, datum.mean-datum.std)",
-    max="min(1, datum.mean+datum.std)"
-)
-    
-# generate the points
-points = base.mark_point(
-    filled=True,
-    size=50,
-    color='black'
-).encode(
-    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("System Output").axis(
-        labelExpr="datum.value % 0.5 ? null : datum.label"
-    ),
-    y=alt.Y('model:N').title("Model")#.scale(domainMin=0, domainMax=1).title('Score'),
-)
-    
-# generate the error bars
-errorbars = base.mark_errorbar().encode(
-    x=alt.X("min:Q").title('1 SD'), #"id:N",
-    x2="max:Q",
-    y="model:N"
-)
-
-(points + errorbars).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=200,
-    width=400,
-    title={
-        'text': '30 Runs on Checklist Item: "Ensure Data File Loads as Expected"',
-        'subtitle': "Ground Truth = 1"
-    }
-)
-
- - -
- -
-
- - -
- -
- -
- - - - \ No newline at end of file diff --git a/report/final_report/docs/final_report.html b/report/final_report/docs/final_report.html index 268a2b9..9f4d0ee 100644 --- a/report/final_report/docs/final_report.html +++ b/report/final_report/docs/final_report.html @@ -7,7 +7,7 @@ -DSCI591 Capstone Final Report +final_report -
+
-

Caption: Comparison of the satisfaction determined by our system versus the ground truth for each checklist item and repository

+

Comparison of our system’s satisfaction determination versus the ground truth for each checklist item and repository

-

We found that our tool tends to undermine the actual satisfying cases. For the items that are actually satisfied (score = 1), our tool tends to classify as partially satisfied (score = 0.5), (FIXME: in the newer run, the actual 0.5 seems to be tagged quite accurately) while for those that are partially satisfied (score = 0.5), our tool often classifies as not satisfied (score = 0).

+

Our tool tends to underrate satisfying cases, which often classifies fully satisfied items as partially satisfied and partially satisfied items as not satisfied.

Code @@ -636,7 +631,7 @@

Accuracy

contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth'] contingency_table.sort_index(level=[0, 2])
-
+
@@ -767,13 +762,13 @@

Accuracy

-

Contingency table of the satisfaction determined by our system versus the ground truth

+

Contingency table of our system’s satisfaction determination versus the ground truth

-

The accuracy issue may be attributed to the need for improvement of prompts in our checklist.

- -
-

Consistency

-

Since the completeness score from LLMs contain randomness, we further studied the consistency of scores across checklist items and repositories.

+

The accuracy issue may be attributed to a need to improve our checklist prompts.

+
    +
  1. Consistency
  2. +
+

As the completeness scores from LLMs contain randomness, we examined the consistency of completeness scores across checklist items and repositories.

Code @@ -821,31 +816,31 @@

Consistency

title="30 Runs on Openja's Repositories for each Checklist Item" )
-
+
-
+
-

Caption: Standard deviations of the score for each checklist item. Each dot represents the standard deviation of scores of 30 runs of a single repository

+

Standard deviations of the score for each checklist item. Each dot represents the standard deviation of scores from 30 runs of a single repository.

-

We found 2 diverging cases. For example, it shows high standard deviations across repositories for item 3.2 Data in the Expected Format. This might be a proof of poor prompt quality, making it ambiguous for the LLM and hence hard to produce consistent results. Prompt engineering might solve this problem.

-

On the other hand, there are outliers yielding exceptionally high standard deviations for item 5.3 Ensure Model Output Shape Aligns with Expectation. This may be because those repositories are unorthodox, but careful manual examination is required for a more definite conclusion.

-
+

We identified two diverging cases:

+
    +
  1. High Standard Deviations
  2. +
+

Items like 3.2 Data in the Expected Format showed high standard deviations across repositories. This might indicate poor prompt quality that makes it hard for the LLM to produce consistent results. Improved prompt engineering could address this issue.

+
    +
  1. Outliers with High Standard Deviations
  2. +
+

Items like 5.3 Ensure Model Output Shape Aligns with Expectation had outliers with exceptionally high standard deviations, which is possibly due to unorthodox repositories. A careful manual examination is required for a more definitive conclusion.

Comparison of gpt-3.5-turbo and gpt-4o

-

To examine if newer LLMs help in both metrics, we preliminarily compared system outputs from gpt-4o and gpt-3.5-turbo on the lightfm repository, we observed that the gpt-4o system consistently returned “Satisfied”, which deviates from the ground truth.

+

To evaluate if newer LLMs improve performance, we preliminarily compared outputs from gpt-4o and gpt-3.5-turbo on the lightfm repository. We observed that gpt-4o consistently returned “Satisfied,” which deviated from the ground truth.

Code @@ -959,31 +960,31 @@

Com titleFontSize=12 )

-
+
-
+
-

Caption: Comparison of the satisfaction using gpt-4o versus using gpt-3.5-turbo for each checklist item on lightfm

+

Comparison of satisfaction using gpt-4o versus gpt-3.5-turbo for each checklist item on lightfm

-

Further investigation into gpt-4o is required to address this issue and enhance the system performance.

+

Further investigation into gpt-4o is required to determine its effectiveness in system performance.

@@ -1039,34 +1040,32 @@

Com

Conclusion

Wrap Up

-

Our project, FixML, represents a significant step forward in the field of machine learning (ML) testing by providing curated checklists and automated tools that enhance the evaluation and creation of test suites for ML models. The development and implementation of FixML have been driven by both the need of better quality assurance in ML systems, and the current limitations of traditional testing methods on ML projects which are either too general without comprehensive clarification, or are too human-reliant.

-

FixML seamlessly takes in the user’s ML codebase, identifies and extracted its existing test suites. Together with the curated checklist on ML testing, FixML leverages Large Language Models (LLMs) to assess the completeness of the test suites and output detailed evaluation reports with completeness scores and specific reasons. This assists users in understanding the performance of their current test suites with insights. Additionally, FixML can generate test function specifications corresponding to the curated checklist, helping users utilizing their test suites.

-

In return, FixML solution combines the scalability of automated testing with the reliability of expert evaluation. By automating the evaluation process, FixML significantly reduces the time and human effort required to assess the quality of ML test suites. This popularizes thorough and efficient quality assessment on ML projects.

The development of FixML has been driven by the need for better quality assurance in ML systems and the limitations of traditional testing methods for ML projects. FixML provides curated checklists and automated tools that enhance the evaluation and creation of test suites for ML projects. This, in turn, significantly reduces the time and effort required to assess the completeness of ML test suites, and thus promotes thorough and efficient assessment of ML projects.

Limitation & Future Improvement

While FixML provides substantial benefits, there are limitations and areas we aim to address in future development:

While FixML provides substantial benefits, there are limitations and areas to be addressed in future development:

  1. Specialized Checklist

The current checklist is designed to be general and may not cover all specific requirements for different ML projects. Future development will focus on creating more specialized checklists for different domains and project types, allowing for more tailored evaluations. Since the checklist format is designed to let users easily expand, edit, and select checklist items for their specific use case, we welcome collaboration with ML researchers on the creation of specialized checklists.

The default checklist is general and may not cover all requirements for different ML projects. Future development will focus on creating specialized checklists for tailored evaluations across various domains and project types. Collaboration with ML researchers is welcomed for creating specialized checklists based on specific use cases.
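
For illustration, a user-editable checklist entry might be stored as structured data along the following lines. The schema shown is an assumption inferred from the fields that appear in the evaluation report (ID, Title, Requirement); the authoritative FixML checklist format may differ.

import yaml

# Hypothetical checklist entry; users could append, edit, or remove entries like this.
checklist_yaml = """
- ID: '3.2'
  Title: Data in the Expected Format
  Requirement: Verify that loaded data conforms to the expected schema and types.
"""
items = yaml.safe_load(checklist_yaml)
print(items[0]['Title'])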

  2. Enhanced Test Evaluator

Our current study reveals varying accuracy and consistency issues in the evaluation results produced with OpenAI GPT models. Future improvements involve prompt enhancement through prompt engineering techniques and support for multiple LLMs, for higher performance and flexibility of the FixML test evaluator. We also expect to deliver user guidelines for editing the prompts in our system, so that ML developers can customize prompts for better performance and collaborate with us to embed them into the system.

Our study reveals accuracy and consistency issues in the evaluation results produced with the OpenAI GPT-3.5-turbo model. Future improvements involve better prompt engineering and support for multiple LLMs for enhanced performance and flexibility, as sketched below. User guidelines on prompt creation will be provided to facilitate collaboration with ML developers.
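
The sketch below shows one way the evaluator could be decoupled from a specific provider: it depends only on a small interface that any LLM backend can implement. Every name here (ChatModel, OpenAIBackend, evaluate_checklist_item) is a hypothetical placeholder, not part of the current FixML code.

from typing import Protocol

class ChatModel(Protocol):
    """Minimal interface an LLM backend must satisfy."""
    def complete(self, prompt: str) -> str:
        ...

class OpenAIBackend:
    """Placeholder backend; the actual provider call would go in complete()."""
    def __init__(self, model: str = 'gpt-3.5-turbo'):
        self.model = model

    def complete(self, prompt: str) -> str:
        raise NotImplementedError("call the provider's chat API here")

def evaluate_checklist_item(item_text: str, test_code: str, llm: ChatModel) -> str:
    """Ask whichever backend is injected to judge the test code against one item."""
    prompt = f'Checklist item:\n{item_text}\n\nTest code:\n{test_code}\n\nIs it satisfied?'
    return llm.complete(prompt)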

  3. Customized Test Specification

The FixML test specification generator currently produces general test function skeletons based solely on the curated checklist, without the context of the specific ML project. Future developments will integrate the ML project codebase into the generation process to output customized test function skeletons. This further lowers the barrier for ML users to create comprehensive test suites relevant to their projects.

Future developments will integrate project-specific information to produce customized test function skeletons. This may further encourage users to create comprehensive tests.

  4. Workflow Optimization #FIXME: have to review whether to include as it seems lower priority.

The current test evaluator and test specification generator are separate entities. This could be improved by embedding a workflow engine that allows the system to automatically take actions based on the LLM response. For instance, if the LLM response suggests that test suites are partially satisfied or non-satisfied, the system could automatically run the test generator to produce test function skeletons and then reevaluate them until they are satisfied or some threshold is met. This would create a more cohesive and efficient workflow, reducing manual intervention and improving overall system performance.

The test evaluator and test specification generator are currently separate. Future improvements could embed a workflow engine that automatically takes actions based on LLM responses, as sketched below. This would create a more cohesive and efficient workflow, reduce manual intervention, and improve overall system performance.
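
As a concrete illustration of such a workflow engine, the loop below alternates evaluation and generation until every item is judged satisfied or an iteration budget runs out. The function names and the 'Evaluation' field are hypothetical placeholders, not existing FixML APIs.

def run_fix_loop(codebase, checklist, evaluate_tests, generate_test_skeletons,
                 max_iterations: int = 3) -> list:
    """Hypothetical evaluate -> generate -> re-evaluate loop with an iteration budget."""
    report = evaluate_tests(codebase, checklist)
    for _ in range(max_iterations):
        unsatisfied = [item for item in report if item['Evaluation'] != 'Satisfied']
        # Stop as soon as every checklist item is judged satisfied.
        if not unsatisfied:
            break
        # Draft skeletons for the unsatisfied items, then re-evaluate the suite.
        generate_test_skeletons(codebase, unsatisfied)
        report = evaluate_tests(codebase, checklist)
    return report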

  5. Performance Optimization #FIXME: have to review whether to include as it seems lower priority.

Performance optimization is another critical area for future development. As FixML handles large codebases and complex evaluations, optimizing the system to handle these tasks more efficiently is essential. This includes improving the speed and accuracy of the LLM responses, reducing the time taken to analyze and generate reports, and ensuring the system can scale effectively to handle more extensive and more complex projects.

By addressing these limitations and focusing on these future improvements, FixML will become an even more powerful tool for ensuring the quality and robustness of machine learning and data science projects.

As FixML handles large codebases and complex evaluations, performance optimization is essential. Future developments will focus on improving the speed and accuracy of LLM responses, reducing analysis and report generation times, and ensuring scalability for handling larger and more complex projects.

By addressing these limitations and implementing future improvements, we aim for FixML to achieve better performance, contribute to the development of better ML systems, and ultimately enhance human life.

@@ -1074,25 +1073,45 @@

References

Alexander, Rohan, Lindsay Katz, Callandra Moore, and Zane Schwartz. 2023. “Evaluating the Decency and Consistency of Data Validation Tests Generated by LLMs.” arXiv Preprint arXiv:2310.01402.

Belanger, Ashley. 2024. “Air Canada Must Honor Refund Policy Invented by Airline’s Chatbot.” Ars Technica. https://arstechnica.com/tech-policy/2024/02/air-canada-must-honor-refund-policy-invented-by-airlines-chatbot/.

Gawande, Atul. 2010. The Checklist Manifesto (HB). Penguin Books India.

Grand-View-Research. 2021. “Artificial Intelligence Market Size, Share & Trends Analysis Report by Solution, by Technology (Deep Learning, Machine Learning), by End-Use, by Region, and Segment Forecasts, 2023–2030.” Grand View Research San Francisco.

Jordan, Jeremy. 2020. “Effective Testing for Machine Learning Systems.” https://www.jeremyjordan.me/testing-ml/.

Kapoor, Sayash, and Arvind Narayanan. 2022. “Leakage and the Reproducibility Crisis in ML-Based Science.” arXiv Preprint arXiv:2207.07048.

Nunwick, Alice. 2023. “ITutorGroup Settles AI Hiring Lawsuit Alleging Age Discrimination.” Verdict. https://www.verdict.co.uk/itutorgroup-settles-ai-hiring-lawsuit-alleging-age-discrimination/.

Openja, Moses, Foutse Khomh, Armstrong Foundjem, Zhen Ming, Mouna Abidi, Ahmed E Hassan, et al. 2023. “Studying the Practices of Testing Machine Learning Software in the Wild.” arXiv Preprint arXiv:2312.12604.

Pineau, Joelle, Philippe Vincent-Lamarre, Koustuv Sinha, Vincent Larivière, Alina Beygelzimer, Florence d’Alché-Buc, Emily Fox, and Hugo Larochelle. 2021. “Improving Reproducibility in Machine Learning Research (a Report from the NeurIPS 2019 Reproducibility Program).” Journal of Machine Learning Research 22 (164): 1–20.

Regidi, Asheeta. 2019. “SEBI’s Circular: The Black Box Conundrum and Misrepresentation in AI-Based Mutual Funds.” Firstpost. https://www.firstpost.com/business/sebis-circular-the-black-box-conundrum-and-misrepresentation-in-ai-based-mutual-funds-6625161.html.

Shepardson, David. 2023. “GM’s Cruise Recalling 950 Driverless Cars After Pedestrian Dragged in Crash.” Reuters. https://www.reuters.com/business/autos-transportation/gms-cruise-recall-950-driverless-cars-after-accident-involving-pedestrian-2023-11-08/.

Team, Microsoft Industry Solutions Engineering. 2023. “Testing Data Science and MLOps Code.” Engineering Fundamentals Playbook. https://microsoft.github.io/code-with-engineering-playbook/machine-learning/ml-testing/.

Zhang, Yue, Yafu Li, Leyang Cui, Deng Cai, Lemao Liu, Tingchen Fu, Xinting Huang, et al. 2023. “Siren’s Song in the AI Ocean: A Survey on Hallucination in Large Language Models.” https://arxiv.org/abs/2309.01219.
diff --git a/report/final_report/docs/img/checklist_sample.png b/report/final_report/docs/img/checklist_sample.png
new file mode 100644
index 0000000..96bfa2d
Binary files /dev/null and b/report/final_report/docs/img/checklist_sample.png differ
diff --git a/report/final_report/docs/logo.png b/report/final_report/docs/img/logo.png
similarity index 100%
rename from report/final_report/docs/logo.png
rename to report/final_report/docs/img/logo.png
diff --git a/report/final_report/docs/img/proposed_system_overview.png b/report/final_report/docs/img/proposed_system_overview.png
new file mode 100644
index 0000000..f830376
Binary files /dev/null and b/report/final_report/docs/img/proposed_system_overview.png differ
diff --git a/report/final_report/docs/img/test_evaluation_report_sample.png b/report/final_report/docs/img/test_evaluation_report_sample.png
new file mode 100644
index 0000000..2bc1528
Binary files /dev/null and b/report/final_report/docs/img/test_evaluation_report_sample.png differ
diff --git a/report/final_report/docs/img/test_spec_sample.png b/report/final_report/docs/img/test_spec_sample.png
new file mode 100644
index 0000000..4eb2baf
Binary files /dev/null and b/report/final_report/docs/img/test_spec_sample.png differ
diff --git a/report/final_report/docs/index.html b/report/final_report/docs/index.html
index d8cf711..09f42ed 100644
--- a/report/final_report/docs/index.html
+++ b/report/final_report/docs/index.html
@@ -1,7 +1,7 @@
- Redirect to 02_plots-for-final-report.html
+ Redirect to final_report.html
diff --git a/report/final_report/docs/proposal.html b/report/final_report/docs/proposal.html
index 9079d53..9a14a8b 100644
--- a/report/final_report/docs/proposal.html
+++ b/report/final_report/docs/proposal.html
@@ -7,7 +7,7 @@
-Proposal Report - Checklists and LLM prompts for efficient and effective test creation in data analysis
+proposal