diff --git a/docs/02_finding-report.html b/docs/02_finding-report.html
deleted file mode 100644
index 5ceefa2..0000000
--- a/docs/02_finding-report.html
+++ /dev/null
@@ -1,2241 +0,0 @@

NOTE: the result is based on the code base abb9a21, which is similar to the commit 69d61a9 in the main branch

import scipy
-import pickle
-import yaml
-import pandas as pd
-import altair as alt
-from collections import Counter
-
-
-
def get_report(response):
-    report = []
-    for result in response.call_results:
-        if result.parsed_response:
-            resp = result.parsed_response['results']
-            for item in resp:
-                item['file'] = result.files_evaluated[0] 
-                item['success'] = result.success
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result.files_evaluated[0],
-                'success': result.success
-            })
-    return pd.DataFrame(report)
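# (annotation) A minimal sketch of the response structure that get_report() assumes,
# with field names inferred from the accesses above; the values are illustrative only:
#
#   response.call_results[0].files_evaluated == ['path/to/test_file.py']
#   response.call_results[0].success == True
#   response.call_results[0].parsed_response == {
#       'results': [
#           {'ID': '2.1', 'Title': '...', 'Requirement': '...', 'Observation': '...',
#            'Functions': [], 'Evaluation': '...', 'Score': 0.5},
#       ]
#   }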
-
-def extract_file_and_scores(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        response = pickle.load(file)
-    report = get_report(response)
-    df = (
-        report
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = report.groupby(['file'])['success'].all()
-    df['response_path'] = resp_path
-    return df.reset_index()
-
-def generate_stat_plot(df_repo__stat, ground_truth=None, facet_col='repo', repo=None, id=None):
-    """
-    Generate Stat plot across all repo and all checklist item
-    Optional to incorporate ground truth and select specific repo/checklist item
-    """
-    if facet_col == 'repo':
-        x_col = 'id'
-        x_title = 'Checklist ID'
-    elif facet_col == 'id':
-        x_col = 'repo'
-        x_title = 'Repository'
-    
-    # the base chart
-    if repo:
-        df_repo__stat = df_repo__stat.query(f'repo == "{repo}"')
-    if id:
-        df_repo__stat = df_repo__stat.query(f'id == "{id}"')
-    
-    base = alt.Chart().transform_calculate(
-        min="max(0, datum.mean-datum.std)",
-        max="min(1, datum.mean+datum.std)"
-    )
-    
-    # generate the points
-    points = base.mark_point(
-        filled=True,
-        size=50,
-        color='black'
-    ).encode(
-        x=alt.X(f'{x_col}:O').axis(labelAngle=0).title(x_title),
-        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
-    )
-    
-    # generate the error bars
-    errorbars = base.mark_errorbar().encode(
-        x=f"{x_col}:O",
-        y=alt.Y("min:Q").title('1 SD'),
-        y2="max:Q"
-    )
-
-    plot = points + errorbars
-    
-    if ground_truth is not None:
-        # generate points of ground truth
-        if repo:
-            ground_truth = ground_truth.query(f'repo == "{repo}"')
-        if id:
-            ground_truth = ground_truth.query(f'id == "{id}"')
-        
-        df_repo__stat = pd.merge(df_repo__stat, ground_truth, how='left', on=['repo', 'id'])
-        
-        gt_points = alt.Chart().mark_point(
-            filled=True,
-            size=100,
-            color='green',
-            shape="diamond"
-        ).encode(
-            x=alt.X(f'{x_col}:O'),
-            y=alt.Y('score:Q')
-        )
-
-        plot += gt_points
-
-    plot = alt.layer(
-                plot,
-                data=df_repo__stat
-            ).properties(
-                width=400,
-            ).facet(
-                column=f'{facet_col}',
-                columns=2
-            )
-
-    return plot
-
-
-

preprocess data

-
-
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']
-
-#result_path = '../draft/batch_run_results/record_combine.yml'
-result_path = '../data/processed/batch_run/record_combine.yml'
-with open(result_path, 'r') as file:
-    config = pd.DataFrame(yaml.safe_load(file))
-
-# prepare score data by repo, run, file
-tmp = [
-    extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
-]
-tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-
-raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')
-
-
-
# filter non-test files in qlib
-df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')
-
-# prepare score data by repo, run
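# (annotation) The aggregation below takes the max Score across a repo's test files,
# i.e. a checklist item counts as satisfied for a run if any one evaluated file satisfies it.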
-df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-    id: ['max'] for id in checklist_ids
-})
-df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-df_repo_run = df_repo_run.reset_index()
-
-# prepare statistics of scores by repo
-df_repo__stat = df_repo_run.groupby(['repo']).agg({
-    id: ['mean', 'std', 'count'] for id in checklist_ids
-})
-df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-df_repo__stat = (
-    df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-    .reset_index()
-    .rename_axis(None, axis=1)
-)
-
-# prepare counting of scores by repo
-df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-for id in checklist_ids[1:]:
-    df_repo__count = df_repo__count.merge(
-        df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-        on=['repo', 'level_1'],
-        how='outer'
-    )
-
-df_repo__count = df_repo__count.fillna(0)
-
-
-
-

Runs Quality

-
-

1. Some non-test files are included in the evaluation

-

For example, ./nanodet/nanodet/trainer/task.py is included even though it is not a test file.

-
-
raw_df_repo_run_file.query('repo == "nanodet"')['file'].unique()[:3]
-
-
array(['../data/raw/openja/nanodet/nanodet/trainer/task.py',
-       '../data/raw/openja/nanodet/tests/test_configs/test_config.py',
-       '../data/raw/openja/nanodet/tests/test_data/test_batch_process.py'],
-      dtype=object)
-
-
-
-
-

2. Evaluation of the file magenta/magenta/models/music_vae/data_test.py always fails

-
-
df_repo_run_file[~df_repo_run_file.success]['file'].unique()
-
-
array(['../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
-       '../data/raw/openja/paperless-ng/src/documents/tests/test_api.py'],
-      dtype=object)
-
-
-
-
-

3. DeepSpeech, lightfm and magenta have the fewest (Python) test files

-
-
df_repo_run_file.query('run == 1').groupby(['repo'])['file'].count().reset_index()
-
-
           repo  file
0    DeepSpeech     3
1        apollo    14
2       lightfm     7
3       magenta     8
4           mmf    70
5  mycroft-core    64
6       nanodet    42
7  paperless-ng    35
8          qlib    31
-
-
-
-
-

4. The test files are not always in a tests/ folder. Would it be good practice to always do that? Should one of the checklist items ensure that all tests are placed under a tests/ folder?

-

For example, magenta

-
-
df_repo_run_file.query('repo == "magenta"')['file'].unique()
-
-
array(['../data/raw/openja/magenta/conftest.py',
-       '../data/raw/openja/magenta/magenta/common/state_util_test.py',
-       '../data/raw/openja/magenta/magenta/models/coconet/export_saved_model_test.py',
-       '../data/raw/openja/magenta/magenta/models/coconet/lib_data.py',
-       '../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
-       '../data/raw/openja/magenta/magenta/models/onsets_frames_transcription/create_dataset_lib_test.py',
-       '../data/raw/openja/magenta/magenta/models/score2perf/datagen_beam_test.py',
-       '../data/raw/openja/magenta/magenta/pipelines/pipeline_test.py'],
-      dtype=object)
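As a rough illustration of how this could be quantified, below is a minimal sketch (assuming the df_repo_run_file frame built above, with its repo and file columns) that computes the share of evaluated files living under a tests/ directory per repository:

import pandas as pd

def share_under_tests(df: pd.DataFrame) -> pd.Series:
    # Flag files whose path contains a "/tests/" component, then average per repo.
    in_tests = df['file'].str.contains('/tests/', regex=False)
    return in_tests.groupby(df['repo']).mean().sort_values()

# e.g. share_under_tests(df_repo_run_file.query('run == 1'))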
-
-
-
-
-
-

Findings on 9 repos

-
-
df_repo_run_file.repo.unique()
-
-
array(['lightfm', 'qlib', 'mmf', 'nanodet', 'magenta', 'DeepSpeech',
-       'paperless-ng', 'mycroft-core', 'apollo'], dtype=object)
-
-
-
-

1. Overview of accuracy and consistency of the lightfm evaluation

-

Let the ground truth for lightfm be as follows:

-
-
# Ground truth
-ground_truth = pd.DataFrame([
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-])
-ground_truth[ground_truth.repo == 'lightfm']
-
-
      repo   id  score
0  lightfm  2.1    1.0
1  lightfm  3.2    1.0
2  lightfm  3.5    0.0
3  lightfm  4.2    1.0
4  lightfm  5.3    0.5
5  lightfm  6.1    1.0
6  lightfm  6.2    1.0
-
-
-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="lightfm", facet_col='repo')
-

The distribution of scores for each checklist item:

-
-
df_repo__count.query('repo == "lightfm"')
-
-
      repo  level_1   2.1   3.2   3.5   4.2   5.3   6.1   6.2
6  lightfm      0.0   0.0   1.0  19.0   0.0  18.0   0.0   0.0
7  lightfm      0.5   1.0  29.0   6.0  27.0  12.0  20.0   4.0
8  lightfm      1.0  29.0   0.0   5.0   3.0   0.0  10.0  26.0
-
-
-

Observations: The system evaluation broadly aligns with our manual evaluation:
- for items we judged "Satisfied" (score = 1), the system mostly outputs 0.5 or 1;
- for items we judged "Partially Satisfied" or "Not Satisfied", the system mostly outputs 0.5 or 0;
- some checklist items display high variance, e.g. 3.5, 5.3 and 6.1.

-
-
-

2. Overview of qlib

-

Let the ground truth for qlib be as follows (FIXME: to be confirmed):

-
-
# Ground truth
-ground_truth[ground_truth.repo == 'qlib']
-
-
    repo   id  score
7   qlib  2.1    0.5
8   qlib  3.2    1.0
9   qlib  3.5    0.0
10  qlib  4.2    0.5
11  qlib  5.3    1.0
12  qlib  6.1    1.0
13  qlib  6.2    1.0
-
-
-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="qlib", facet_col='repo')
-
-
df_repo__count.query('repo == "qlib"')
-
-
    repo  level_1   2.1   3.2   3.5   4.2   5.3   6.1   6.2
24  qlib      0.0   0.0   1.0  29.0   3.0  14.0   4.0   1.0
25  qlib      0.5   0.0  12.0   1.0  27.0  16.0  24.0  26.0
26  qlib      1.0  30.0  17.0   0.0   0.0   0.0   2.0   3.0
-
-
-

Observations:
- There is more disagreement between the system and the manual evaluation, especially for 5.3, 6.1 and 6.2.
- The per-item consistency in this repo is not similar to that in lightfm: e.g. the variance for 3.5 is greatly reduced, while the variance for 3.2 becomes larger.
- However, qlib is not just a machine learning project; it also contains a software system. For example, it generates a lot of data randomly by itself instead of reading a data file for analysis, which seems to deviate from the objective of 2.1.

-
-
-

3. The consistency of each checklist item

-
  • Why is it important? If the score of a particular item varies a lot when evaluating a repository, it might mean that its prompt (Requirement) is confusing to the LLM, or that the checklist item itself is not well defined.
-
df_repo__stat.pivot(index='id', columns='repo', values='std')
-
-
repo  DeepSpeech    apollo   lightfm   magenta       mmf  mycroft-core   nanodet  paperless-ng      qlib
id
2.1     0.479463  0.152564  0.091287  0.423451  0.000000      0.000000  0.000000      0.242117  0.000000
3.2     0.406838  0.215092  0.091287  0.189525  0.245066      0.278027  0.239732      0.091287  0.285673
3.5     0.000000  0.000000  0.388040  0.252003  0.126854      0.000000  0.252003      0.000000  0.091287
4.2     0.000000  0.000000  0.152564  0.091287  0.126854      0.000000  0.254274      0.000000  0.152564
5.3     0.000000  0.000000  0.249136  0.000000  0.126854      0.000000  0.000000      0.000000  0.253708
6.1     0.351107  0.172873  0.239732  0.252003  0.233046      0.000000  0.285673      0.000000  0.224888
6.2     0.000000  0.000000  0.172873  0.000000  0.201289      0.253708  0.260415      0.126854  0.182574
-
-
-
-
alt.Chart(df_repo__stat).mark_boxplot().encode(
-    x="std:Q",
-    y='id:N'
-).properties(
-    height=200,
-    width=400
-)
-

Observations:
- The evaluation of checklist item 2.1 Ensure Data File Loads as Expected is usually stable: when evaluating a repository, its standard deviation is smaller than 0.05 half of the time, the smallest among all items.

-

Below is the breakdown of item scores for each repository
(NOTE: only lightfm and qlib have ground truth, shown as green diamonds):

-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, facet_col='id')
-

Observations:
- (TBC) The standard deviations for items 3.5 and 5.3 vary greatly across repositories, which might imply that the test cases in some repos are confusing to the LLM while those in others are clear.
- (TBC) The standard deviations for items 5.3, 6.1 and 6.2 are relatively high and consistent, which might imply there is room to refine the prompts to reduce consistency issues.

-
-
-

4. The consistency of each checklist item, compared to lightfm

-
  • Why is it important? We optimized the consistency of our system using lightfm, so we treat this repository as a benchmark. If a particular checklist item has much worse consistency in another repository, that might mean the prompt for that item is not generalizable.

Below are the standard deviations over 30 runs for each checklist item and repository:

-
-
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
-stds
-
-
               std
id             2.1       3.2       3.5       4.2       5.3       6.1       6.2
repo
DeepSpeech     0.479463  0.406838  0.000000  0.000000  0.000000  0.351107  0.000000
apollo         0.152564  0.215092  0.000000  0.000000  0.000000  0.172873  0.000000
lightfm        0.091287  0.091287  0.388040  0.152564  0.249136  0.239732  0.172873
magenta        0.423451  0.189525  0.252003  0.091287  0.000000  0.252003  0.000000
mmf            0.000000  0.245066  0.126854  0.126854  0.126854  0.233046  0.201289
mycroft-core   0.000000  0.278027  0.000000  0.000000  0.000000  0.000000  0.253708
nanodet        0.000000  0.239732  0.252003  0.254274  0.000000  0.285673  0.260415
paperless-ng   0.242117  0.091287  0.000000  0.000000  0.000000  0.000000  0.126854
qlib           0.000000  0.285673  0.091287  0.152564  0.253708  0.224888  0.182574
-
-
-
-
stds_p = stds.copy()
-stds_p.columns = [col[1] for col in stds_p.columns]
-stds_p = stds_p.reset_index()
-stds_p = stds_p.melt(id_vars='repo', var_name='id')
-
-
-
stds_p.head()
-
-
         repo   id     value
0  DeepSpeech  2.1  0.479463
1      apollo  2.1  0.152564
2     lightfm  2.1  0.091287
3     magenta  2.1  0.423451
4         mmf  2.1  0.000000
-
-
-
-
# stripplot = (
-#     alt.Chart(stds_p)
-#     .mark_point(filled=True, size=100)
-#     .transform_calculate( 
-#         # Generate Gaussian jitter with a Box-Muller transform 
-#         jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
-#         # jitter='random()'
-#     ).encode( 
-#         y=alt.Y( 
-#             'jitter:Q', 
-#             title=None, 
-#             axis=alt.Axis(ticks=False, grid=True, labels=False), 
-#             scale=alt.Scale(), 
-#         ), 
-#         x=alt.X('value:Q'), 
-#         color=alt.Color('repo:N'),
-#         row=alt.Row( 
-#             'id:N',
-#             header=alt.Header(
-#                 labelFontSize=16,
-#                 labelAngle=0
-#             )
-#         ),
-#         tooltip='repo'
-#     ).configure_facet( 
-#         spacing=0
-#     ).configure_view( 
-#         stroke=None
-#     ).configure_axis( 
-#         labelFontSize=16, 
-#         titleFontSize=16
-#     ).properties(
-#         height=50, 
-#         width=600
-#     ) 
-# )
-    
-# stripplot 
-
-
-
def generate_jitterbox_plot(df_stds_p):
-    """
-    Generate jitterbox plot across all repo and all checklist item
-    """
-    box = alt.Chart().mark_boxplot(
-        color='grey',
-        opacity=0.5,
-        size=20,
-    ).encode(
-        x=alt.X('value:Q').title('SD(Score)'),
-        y=alt.Y('id:N', title=None, axis=alt.Axis(labelPadding=10, grid=False))
-    )
-    
-    stripplot = alt.Chart().mark_circle(size=100).encode(
-        y=alt.Y( 
-            'id:N',
-            axis=alt.Axis(ticks=False, grid=True, labels=True), 
-            scale=alt.Scale(), 
-        ), 
-        x='value:Q',
-        yOffset="jitter:Q",
-        color=alt.Color('id:N', legend=None),
-        tooltip='repo'
-    ).transform_calculate(
-        # Generate Gaussian jitter with a Box-Muller transform
-        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-    )
-    
-    plot = alt.layer(
-        box,
-        stripplot,
-        data=df_stds_p
-    ).configure_view( 
-        stroke=None
-    ).configure_axis( 
-        labelFontSize=16, 
-        titleFontSize=16
-    ).properties(
-        height=300, 
-        width=600
-    ) 
-    
-    return plot
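For reference, the jitter expression in the code above is the Box-Muller transform, which turns two Uniform(0, 1) draws into one (approximately) standard normal draw; a NumPy equivalent of the Vega expression, for illustration only:

import numpy as np

def box_muller_jitter(n, rng=np.random.default_rng(0)):
    # Same formula as the Vega expression sqrt(-2*log(random()))*cos(2*PI*random()):
    # two independent uniforms in, one (approximately) standard normal draw out.
    u1, u2 = rng.random(n), rng.random(n)
    return np.sqrt(-2 * np.log(u1)) * np.cos(2 * np.pi * u2)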
-
-
-
generate_jitterbox_plot(stds_p)
-
-
alt.Chart(df_repo__stat).mark_boxplot().encode(
-    x="std:Q",
-    y='id:N'
-).properties(
-    height=200,
-    width=400
-)
-
-
# !pip install altair_catplot
-# !pip install seaborn
-
-
-
# import altair_catplot
-
-# altair_catplot.catplot(
-#     stds_p, 
-#     transform ='jitterbox', 
-#     mark ='point', 
-#     encoding = dict(
-#         x = alt.X('value:Q'), 
-#         y = alt.Y('id:N'), 
-#         color = alt.Color('repo:N')
-#     ) 
-# )
-
-
-
F = stds.drop(index='lightfm') / stds.loc['lightfm']
-
-base = alt.Chart(
-    F.melt(ignore_index=False).reset_index()[['repo', 'id', 'value']]
-).transform_calculate(
-    benchmark="1",
-    threshold=f"{scipy.stats.f.ppf(0.975, 29, 29)}"
-)
-
-point = base.mark_point(
-    filled=True,
-    size=100,
-).encode(
-    x=alt.X('value:Q').title("std ratio (c.f. lightfm)"),
-    y='id:N',
-    color='repo',
-    tooltip='repo'
-).properties(
-    height=200,
-    width=400
-)
-
-point \
-+ base.mark_rule(color='black').encode(x="benchmark:Q") \
-+ base.mark_rule(color='red').encode(x="threshold:Q")
-# jitter instead of mark_point <-- prompt vs. repo problem?
-# prompt: sd of checklist item for all repo is high
-# repo: most of repo have low sd, the repo we're looking at has outlier
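A note on the red threshold line in the chart above: with 30 runs per repository, each sample variance has 29 degrees of freedom, so under the null hypothesis that an item is as consistent in another repository as in lightfm, the ratio of the two sample variances follows an F(29, 29) distribution, and scipy.stats.f.ppf(0.975, 29, 29) is its upper 97.5% quantile (about 2.1). Since the quantity plotted here is a ratio of standard deviations rather than variances, the strictly comparable cutoff on that scale would be the square root of that quantile (about 1.45):

import scipy.stats

var_ratio_cutoff = scipy.stats.f.ppf(0.975, dfn=29, dfd=29)  # cutoff for a ratio of variances
std_ratio_cutoff = var_ratio_cutoff ** 0.5                   # the same cutoff on the std-ratio scale
print(round(var_ratio_cutoff, 3), round(std_ratio_cutoff, 3))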
-

Observations:
- The evaluation of checklist item 3.2 Data in the Expected Format becomes much more unstable in most of the other repositories.
- That of 2.1 is significantly unstable in the repos paperless-ng, magenta and DeepSpeech, but this may be due to the repos themselves.

-

TODO: look into the scores for item 3.2.

-
-
-

TODO: Given ground truth == 1, distribution of system score?

-
-
-

TODO: Given ground truth == 0, distribution of system score?

-
-
def generate_histogram_plot(df_repo_run_long, df_ground_truth=None, repo=None, id=None):
-    """
-    Generate histogram across all repo and all checklist item
-    Optional to incorporate ground truth and select specific repo/checklist item
-    """
-    # data
-    repo_data = df_repo_run_long.copy()
-    if repo:
-        repo_data = repo_data.query(f'repo == "{repo}"')
-    if id:
-        repo_data = repo_data.query(f'id == "{id}"')
-
-    # base histogram chart
-    base = alt.Chart().mark_bar().encode(
-                x=alt.X('eval_score:Q', title='Score'), 
-                y=alt.Y('count()'), 
-                color=alt.value('grey'),
-                size=alt.value(20),
-            )
-    
-    if df_ground_truth is not None:
-        # data
-        gt_data = df_ground_truth.copy()
-        if repo:
-            gt_data = gt_data.query(f'repo == "{repo}"')
-        if id:
-            gt_data = gt_data.query(f'id == "{id}"')
-        
-        repo_data = pd.merge(repo_data, gt_data, how='left', on=['repo', 'id'])
-        repo_data['is_equal_to_gt'] = repo_data['eval_score'] == repo_data['score']
-        
-        # base histogram chart
-        base = base.encode(
-                    color=alt.Color('is_equal_to_gt', scale=alt.Scale(range=['grey', 'green']), legend=None)
-                )
-        base += base.mark_text().encode(
-            text=alt.value('Ground Truth'),
-            x='score',
-            size=alt.value(10),
-            color=alt.value('green'),
-        )
-
-    plot = alt.layer(
-                base,
-                data=repo_data
-            ).properties(
-                width=200,
-                height=200,
-            ).facet(
-                row='repo',
-                column='id'
-            )        
-    
-    return plot
-
-
-
-

Contingency Table

-
-
df_repo_run_p = pd.melt(df_repo_run, id_vars=['repo', 'run'], var_name='id', value_name='eval_score')
-df_repo_run_p = pd.merge(df_repo_run_p, ground_truth, how='inner', on=['repo', 'id'])
-df_repo_run_p = df_repo_run_p.rename(columns={'score': 'ground_truth'})
-pd.pivot_table(df_repo_run_p, values='run', index=['ground_truth'], columns=['eval_score'], aggfunc='count', fill_value=0)
-
-
eval_score    0.0  0.5  1.0
ground_truth
0.0           227    8   35
0.5            21   39   30
1.0            21  159   90
-
-
-
-
df_repo_run_p
-
-
           repo  run   id  eval_score  ground_truth
0    DeepSpeech    1  2.1         1.0           0.0
1    DeepSpeech    2  2.1         1.0           0.0
2    DeepSpeech    3  2.1         0.0           0.0
3    DeepSpeech    4  2.1         1.0           0.0
4    DeepSpeech    5  2.1         0.0           0.0
..          ...  ...  ...         ...           ...
625        qlib   26  6.2         0.5           1.0
626        qlib   27  6.2         0.5           1.0
627        qlib   28  6.2         0.5           1.0
628        qlib   29  6.2         1.0           1.0
629        qlib   30  6.2         0.5           1.0

630 rows × 5 columns

-
-
-
-
-
# generate_histogram_plot(df_repo_run_p, df_ground_truth=ground_truth)
-
-
\ No newline at end of file
diff --git a/docs/04_plots-for-presentations.html b/docs/04_plots-for-presentations.html
deleted file mode 100644
index 2168f03..0000000
--- a/docs/04_plots-for-presentations.html
+++ /dev/null
@@ -1,1036 +0,0 @@

Accuracy: Contingency table

!pip install scipy altair
-
-
Requirement already satisfied: scipy in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (1.13.1)
-Collecting altair
-  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
-Requirement already satisfied: numpy<2.3,>=1.22.4 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from scipy) (1.26.4)
-Requirement already satisfied: jinja2 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (3.1.4)
-Requirement already satisfied: jsonschema>=3.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (4.22.0)
-Requirement already satisfied: packaging in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (23.2)
-Requirement already satisfied: pandas>=0.25 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (2.2.2)
-Collecting toolz (from altair)
-  Using cached toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
-Requirement already satisfied: attrs>=22.2.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (23.2.0)
-Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (2023.12.1)
-Requirement already satisfied: referencing>=0.28.4 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (0.35.1)
-Requirement already satisfied: rpds-py>=0.7.1 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (0.18.1)
-Requirement already satisfied: python-dateutil>=2.8.2 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2.9.0.post0)
-Requirement already satisfied: pytz>=2020.1 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2024.1)
-Requirement already satisfied: tzdata>=2022.7 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2024.1)
-Requirement already satisfied: MarkupSafe>=2.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jinja2->altair) (2.1.5)
-Requirement already satisfied: six>=1.5 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=0.25->altair) (1.16.0)
-Using cached altair-5.3.0-py3-none-any.whl (857 kB)
-Using cached toolz-0.12.1-py3-none-any.whl (56 kB)
-Installing collected packages: toolz, altair
-Successfully installed altair-5.3.0 toolz-0.12.1
-
-
-
-
import scipy
-import pickle
-import json
-import yaml
-import pandas as pd
-import altair as alt
-from collections import Counter
-
-
-
def get_report(response):
-    report = []
-    for result in response.call_results:
-        if result.parsed_response:
-            resp = result.parsed_response['results']
-            for item in resp:
-                item['file'] = result.files_evaluated[0] 
-                item['success'] = result.success
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result.files_evaluated[0],
-                'success': result.success
-            })
-    return pd.DataFrame(report)
-
-def get_report_json(response):
-    report = []
-    for result in response['call_results']:
-        if result['parsed_response']:
-            resp = result['parsed_response']['results']
-            for item in resp:
-                item['file'] = result['files_evaluated'][0] 
-                item['success'] = result['success']
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result['files_evaluated'][0],
-                'success': result['success']
-            })
-    return pd.DataFrame(report)
-
-def extract_file_and_scores(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        try:
-            response = pickle.load(file)
-            report = get_report(response)
-        except:
-            response = json.load(file)
-            report = get_report_json(response)
-    df = (
-        report
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = report.groupby(['file'])['success'].all()
-    df['response_path'] = resp_path
-    return df.reset_index()
-
-
-
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']
-
-def read_and_preprocess(result_path):
-    with open(result_path, 'r') as file:
-        config = pd.DataFrame(yaml.safe_load(file))
-    
-    # prepare score data by repo, run, file
-    tmp = [
-        extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
-    ]
-    tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-    
-    raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')
-
-    # filter non-test files in qlib
-    df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')
-    
-    # prepare score data by repo, run
-    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-        id: ['max'] for id in checklist_ids
-    })
-    df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-    df_repo_run = df_repo_run.reset_index()
-    
-    # prepare statistics of scores by repo
-    df_repo__stat = df_repo_run.groupby(['repo']).agg({
-        id: ['mean', 'std', 'count'] for id in checklist_ids
-    })
-    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-    df_repo__stat = (
-        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-        .reset_index()
-        .rename_axis(None, axis=1)
-    )
-    
-    # prepare counting of scores by repo
-    df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-    for id in checklist_ids[1:]:
-        df_repo__count = df_repo__count.merge(
-            df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-            on=['repo', 'level_1'],
-            how='outer'
-        )
-    
-    df_repo__count = df_repo__count.fillna(0)
-
-    return (df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count)
-
-
-
# Ground truth
-ground_truth = pd.DataFrame([
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-])
-
-
-
id_item_map = {
-    '2.1': 'Ensure Data File Loads as Expected',
-    '3.2': 'Data in the Expected Format',
-    '3.5': 'Check for Duplicate Records in Data',
-    '4.2': 'Verify Data Split Proportion',
-    '5.3': 'Ensure Model Output Shape Aligns with Expectation',
-    '6.1': 'Verify Evaluation Metrics Implementation',
-    '6.2': "Evaluate Model's Performance Against Thresholds"
-}
-
-
-
#result_path = '../draft/batch_run_results/record_combine.yml'
-df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count = read_and_preprocess(
-    '../data/processed/batch_run/record_combine.yml'
-)
-
-
-
cont_table = pd.melt(
-    df_repo_run.query('(repo == "lightfm")')[['repo', 'run', '3.5', '4.2', '5.3']], 
-    id_vars=['repo', 'run'], var_name='id', value_name='System Output')
-cont_table = pd.merge(cont_table, ground_truth, how='inner', on=['repo', 'id'])
-cont_table = cont_table.rename(columns={'score': 'ground_truth'})
-cont_table['title'] = cont_table['id'].apply(lambda x: id_item_map[x])
-#cont_table = cont_table[['repo', 'title', 'ground_truth', 'System Output', 'run']]
-cont_table = pd.pivot_table(cont_table, values='run', index=['repo', 'id', 'title', 'ground_truth'], columns=['System Output'], aggfunc='count', fill_value=0)
-cont_table.index.names = ['Repository', 'ID', 'Title', 'Ground Truth']
-cont_table.sort_index(level=3)
-
-
System Output                                                                     0.0  0.5  1.0
Repository  ID   Title                                               Ground Truth
lightfm     3.5  Check for Duplicate Records in Data                 0.0            19    6    5
            5.3  Ensure Model Output Shape Aligns with Expectation   0.5            18   12    0
            4.2  Verify Data Split Proportion                        1.0             0   27    3
-
-
-
-

Consistency: jitterbox plot

-
-
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id').copy()
-stds.columns = [col[1] for col in stds.columns]
-stds = stds.reset_index()
-stds = stds.melt(id_vars='repo', var_name='id')
-stds['title'] = stds['id'].apply(lambda x: id_item_map[x])
-
-
-
box = alt.Chart().mark_boxplot(
-    color='grey',
-    opacity=0.5,
-    size=20,
-).encode(
-    x=alt.X('value:Q').title('System Output Uncertainty'),
-    y=alt.Y('title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))
-)
-
-stripplot = alt.Chart().mark_circle(size=100).encode(
-    y=alt.Y( 
-        'title:N',
-        axis=alt.Axis(ticks=False, grid=True, labels=True), 
-        scale=alt.Scale(), 
-    ), 
-    x='value:Q',
-    yOffset="jitter:Q",
-    color=alt.Color('id:N', legend=None),
-    tooltip='repo'
-).transform_calculate(
-    # Generate Gaussian jitter with a Box-Muller transform
-    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-)
-
-plot = alt.layer(
-    box,
-    stripplot,
-    data=stds
-).configure_view( 
-    stroke=None
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=300, 
-    width=600,
-    title="30 Runs on Openja's Repositories for each Checklist Item"
-) 
-
-
-
plot
-

Improvement from gpt-3.5-turbo to gpt-4o

-
-
#result_path = '../draft/batch_run_results/record_combine.yml'
-df_repo_run_file_4o, df_repo_run_4o, df_repo_4o__stat, df_repo_4o__count = read_and_preprocess(
-    '../data/processed/batch_run_4o/record_combine.yml'
-)
-
-
-
df_repo_4o__stat
-
-
      repo   id  count  mean  std
0  lightfm  2.1   30.0   1.0  0.0
1  lightfm  3.2   30.0   1.0  0.0
2  lightfm  3.5   30.0   1.0  0.0
3  lightfm  4.2   30.0   1.0  0.0
4  lightfm  5.3   30.0   1.0  0.0
5  lightfm  6.1   30.0   1.0  0.0
6  lightfm  6.2   30.0   1.0  0.0
-
-
-
-
df1 = df_repo__stat.query('(repo == "lightfm") & (id == "4.2")').copy()
-df1['model'] = ['gpt-3.5-turbo']
-
-df2 = df_repo_4o__stat.query('(repo == "lightfm") & (id == "4.2")').copy()
-df2['model'] = ['gpt-4o']
-
-df_model_comp = pd.concat((df1, df2), axis=0)
-
-
-
base = alt.Chart(df_model_comp).transform_calculate(
-    min="max(0, datum.mean-datum.std)",
-    max="min(1, datum.mean+datum.std)"
-)
-    
-# generate the points
-points = base.mark_point(
-    filled=True,
-    size=50,
-    color='black'
-).encode(
-    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("System Output").axis(
-        labelExpr="datum.value % 0.5 ? null : datum.label"
-    ),
-    y=alt.Y('model:N').title("Model")#.scale(domainMin=0, domainMax=1).title('Score'),
-)
-    
-# generate the error bars
-errorbars = base.mark_errorbar().encode(
-    x=alt.X("min:Q").title('1 SD'), #"id:N",
-    x2="max:Q",
-    y="model:N"
-)
-
-(points + errorbars).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=200,
-    width=400,
-    title={
-        'text': '30 Runs on Checklist Item: "Ensure Data File Loads as Expected"',
-        'subtitle': "Ground Truth = 1"
-    }
-)
-
-
\ No newline at end of file
diff --git a/docs/final_report.html b/docs/final_report.html
index e04f18e..9f4d0ee 100644
--- a/docs/final_report.html
+++ b/docs/final_report.html
@@ -54,7 +54,26 @@
+/* CSS for citations */
+div.csl-bib-body { }
+div.csl-entry {
+  clear: both;
+}
+.hanging-indent div.csl-entry {
+  margin-left: 2em;
+  text-indent: -2em;
+}
+div.csl-left-margin {
+  min-width: 2em;
+  float: left;
+}
+div.csl-right-inline {
+  margin-left: 2em;
+  padding-left: 1em;
+}
+div.csl-indent {
+  margin-left: 2em;
+}
@@ -93,6 +112,9 @@
@@ -107,7 +129,7 @@
import pandas as pd
+gt = pd.read_csv('ground_truth.csv')
+gt
+
+
    id  title                                               DeepSpeech  lightfm  qlib
0  2.1  Ensure Data File Loads as Expected                         0.0      1.0   0.5
1  3.2  Data in the Expected Format                                0.0      1.0   1.0
2  3.5  Check for Duplicate Records in Data                        0.0      0.0   0.0
3  4.2  Verify Data Split Proportion                               0.0      1.0   0.5
4  5.3  Ensure Model Output Shape Aligns with Expectation          0.0      0.5   1.0
5  6.1  Verify Evaluation Metrics Implementation                   0.0      1.0   1.0
6  6.2  Evaluate Model's Performance Against Thresholds            0.0      1.0   1.0
+
-

Caption: Ground truth data on the 3 repositories

+

Ground truth data for the 3 repositories. (1 = fully satisfied, 0.5 = partially satisfied, 0 = not satisfied)

Code -
# FIXME: jitter-mean-sd plot (checklist item vs. score) for each repo
+
# FIXME: jitter-mean-sd plot (checklist item vs. score) for each repo
+import altair as alt
+import pandas as pd
+
+df_repo__stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')
+gt = pd.read_csv('ground_truth.csv')
+gt = gt.melt(id_vars=['id', 'title'], var_name='repo', value_name='ground_truth')
+
+df_repo__stat_with_gt = df_repo__stat.merge(gt, on=['id', 'title', 'repo'])
+
+base = alt.Chart(
+    df_repo__stat_with_gt.query('repo in ["lightfm", "qlib", "DeepSpeech"]')
+).transform_calculate(
+    min="max(0, datum.mean-datum.std)",
+    max="min(1, datum.mean+datum.std)"
+)
+    
+# generate the points
+points = base.mark_point(
+    filled=True,
+    size=50,
+    color='black'
+).encode(
+    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Score").axis(
+        labelExpr="datum.value % 0.5 ? null : datum.label"
+    ),
+    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),
+)
+
+# generate the points for ground truth
+gt_points = base.mark_point(
+    filled=True,
+    size=200,
+    color='green',
+    shape="diamond"
+).encode(
+    x=alt.X('ground_truth:Q'),
+    y=alt.Y('id_title:N')
+)
+
+# generate the error bars
+errorbars = base.mark_errorbar().encode(
+    x=alt.X("min:Q").title('1 SD'), #"id:N",
+    x2="max:Q",
+    y="id_title:N"
+)
+
+(gt_points + points + errorbars).facet(
+    column=alt.Column('repo:N').title(None)
+).configure_axis( 
+    labelFontSize=12, 
+    titleFontSize=12
+)
+
-

Caption: Comparison of the satisfaction determined by our system versus the ground truth for each checklist item and repository

+

Comparison of our system’s satisfaction determination versus the ground truth for each checklist item and repository

-

We found that our tool tends to undermine the actual satisfying cases. For the items that are actually satisfied (score = 1), our tool tends to classify as partially satisfied (score = 0.5), while for those that are partially satisfied (score = 0.5), our tool often classfies as not satisfied (score = 0).

+

Our tool tends to underrate satisfying cases, which often classifies fully satisfied items as partially satisfied and partially satisfied items as not satisfied.

Code -
# FIXME: contingency table
+
df_repo_run = pd.read_csv('score_by_repo_run_3.5-turbo.csv')
+
+df_repo_run = df_repo_run.merge(gt, on=['id', 'title', 'repo'])
+
+contingency_table = pd.pivot_table(
+    df_repo_run,
+    values='run', 
+    index=['repo', 'id_title', 'ground_truth'], 
+    columns=['score'],
+    aggfunc='count', 
+    fill_value=0
+)
+contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth']
+contingency_table.sort_index(level=[0, 2])
+
+
score                                                                              0.0  0.5  1.0
Repository  Checklist Item                                          Ground Truth
lightfm     3.5. Check for Duplicate Records in Data                0.0             30    0    0
            5.3. Ensure Model Output Shape Aligns with Expectation  0.5              1   29    0
            2.1. Ensure Data File Loads as Expected                 1.0              0    0   30
            3.2. Data in the Expected Format                        1.0              0   30    0
            4.2. Verify Data Split Proportion                       1.0              0   11   19
            6.1. Verify Evaluation Metrics Implementation           1.0              0    5   25
            6.2. Evaluate Model's Performance Against Thresholds    1.0              0    1   29
qlib        3.5. Check for Duplicate Records in Data                0.0             23    7    0
            2.1. Ensure Data File Loads as Expected                 0.5              0    0   30
            4.2. Verify Data Split Proportion                       0.5              3   25    2
            3.2. Data in the Expected Format                        1.0              0   14   16
            5.3. Ensure Model Output Shape Aligns with Expectation  1.0              1   25    4
            6.1. Verify Evaluation Metrics Implementation           1.0              2   18   10
            6.2. Evaluate Model's Performance Against Thresholds    1.0              0   24    6
+
-

Contingency table of the satisfaction determined by our system versus the ground truth

+

Contingency table of our system’s satisfaction determination versus the ground truth

-

The accuracy issue may be attributed to the need for improvement of prompts in our checklist.

- -
-

Consistency

-

Since the completeness score from LLMs contain randomness, we further studied the consistency of scores across checklist items and reposities.

+

The accuracy issue may be attributed to a need to improve our checklist prompts.

+
+Consistency
+

As the completeness scores from LLMs contain randomness, we examined the consistency of completeness scores across checklist items and repositories.

Code -
# FIXME: jitter-boxplot, checklist item vs. SD
+
stds = df_repo__stat[['repo', 'std', 'id_title']].pivot(index='repo', columns='id_title').copy()
+stds.columns = [col[1] for col in stds.columns]
+stds = stds.reset_index()
+stds = stds.melt(id_vars='repo', var_name='id_title')
+
+base = alt.Chart(stds)
+
+box = base.mark_boxplot(
+    color='grey',
+    opacity=0.5,
+    size=20,
+).encode(
+    x=alt.X('value:Q').title('Standard Deviation of Scores'),
+    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))
+)
+
+stripplot = base.mark_circle(size=100).encode(
+    y=alt.Y( 
+        'id_title:N',
+        axis=alt.Axis(ticks=False, grid=True, labels=True), 
+        scale=alt.Scale(), 
+    ), 
+    x='value:Q',
+    yOffset="jitter:Q",
+    color=alt.Color('id_title:N', legend=None),
+    tooltip='repo'
+).transform_calculate(
+    # Generate Gaussian jitter with a Box-Muller transform
+    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
+)
+
+(
+    box + stripplot
+).configure_view( 
+    stroke=None
+).configure_axis( 
+    labelFontSize=12, 
+    titleFontSize=12
+).properties(
+    height=300, 
+    width=600,
+    title="30 Runs on Openja's Repositories for each Checklist Item"
+) 
+
-

Caption: Standard deviations of the score for each checklist item. Each dot represents the standard deviation of scores of 30 runs of a sigle repository

+

Standard deviations of the score for each checklist item. Each dot represents the standard deviation of scores from 30 runs of a single repository.

-

We found 2 diverging cases. For example, it shows high standard deviations across repositories for item 3.2 Data in the Expected Format. This might be a proof of poor prompt quality, making it ambiguous for the LLM and hence hard to produce consistent results. Prompt engineering might solve this problem.

-

On the other hand, there are outliers yielding exceptionally high standard deviations for item 5.3 Ensure Model Output Shape Aligns with Expectation. This may be because those repositories are unorthodox, and careful manual examination is required to achieve a more robust conclusion.

-
+

We identified two diverging cases:

+
+  1. High Standard Deviations
+

Items like 3.2 Data in the Expected Format showed high standard deviations across repositories. This might indicate that the prompt is not clear enough for the LLM to produce consistent results. Improved prompt engineering could address this issue.

+
+  2. Outliers with High Standard Deviations
+

Items like 5.3 Ensure Model Output Shape Aligns with Expectation had outliers with exceptionally high standard deviations, which is possibly due to unorthodox repositories. A careful manual examination is required for a more definitive conclusion.

Comparison of gpt-3.5-turbo and gpt-4o

-

To examine if newer LLMs help in both metrics, we preliminarily compared system outputs from gpt-4o and gpt-3.5-turbo on the lightfm repository, we observed that the gpt-4o system consistently returned “Satisfied”, which deviates from the ground truth.

+

To evaluate if newer LLMs improve performance, we preliminarily compared outputs from gpt-4o and gpt-3.5-turbo on the lightfm repository. We observed that gpt-4o consistently returned “Satisfied,” which deviated from the ground truth.

Code -
# FIXME: jitter-mean-sd plot (checklist item vs. score) for each repo
+
# FIXME: jitter-mean-sd plot (checklist item vs. score) for each repo
+df_repo_4o__stat = pd.read_csv('score_stat_by_repo_4o.csv')
+df_repo_4o__stat_with_gt = df_repo_4o__stat.merge(gt, on=['id', 'title', 'repo'])
+df_repo_4o__stat_with_gt['model'] = 'gpt-4o'
+
+df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query("repo == 'lightfm'").copy()
+df_repo_35turbo__stat_with_gt['model'] = 'gpt-3.5-turbo'
+
+df_model_comp = pd.concat(
+    (df_repo_35turbo__stat_with_gt, df_repo_4o__stat_with_gt), 
+    axis=0
+)
+
+base = alt.Chart(
+    df_model_comp
+).transform_calculate(
+    min="max(0, datum.mean-datum.std)",
+    max="min(1, datum.mean+datum.std)"
+)
+    
+# generate the points
+points = base.mark_point(
+    filled=True,
+    size=50,
+    color='black'
+).encode(
+    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Score").axis(
+        labelExpr="datum.value % 0.5 ? null : datum.label"
+    ),
+    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),
+)
+
+# generate the points for ground truth
+gt_points = base.mark_point(
+    filled=True,
+    size=200,
+    color='green',
+    shape="diamond"
+).encode(
+    x=alt.X('ground_truth:Q'),
+    y=alt.Y('id_title:N')
+)
+
+# generate the error bars
+errorbars = base.mark_errorbar().encode(
+    x=alt.X("min:Q").title('1 SD'), #"id:N",
+    x2="max:Q",
+    y="id_title:N"
+)
+
+(gt_points + points + errorbars).facet(
+    column=alt.Column('model:N').title(None)
+).configure_axis( 
+    labelFontSize=12, 
+    titleFontSize=12
+)
+
-

Caption: Comparison of the satisfaction using gpt-4o versus using gpt-3.5-turbo for each checklist item on lightfm

+

Comparison of satisfaction using gpt-4o versus gpt-3.5-turbo for each checklist item on lightfm

-

Further investigation into gpt-4o is required to address this issue and enhance the system performance.

+

Further investigation into gpt-4o is required to determine its effectiveness in system performance.

@@ -437,40 +1040,82 @@


Conclusion

Wrap Up

-

Our project, FixML, represents a significant step forward in the field of machine learning (ML) testing by providing curated checklists and automated tools that enhance the evaluation and creation of test suites for ML models. The development and implementation of FixML have been driven by both the need of better quality assurance in ML systems, and the current limitations of traditional testing methods on ML projects which are either too general without comprehensive clarification, or are too human-reliant.

-

FixML seamlessly takes in the user’s ML codebase, identifies and extracted its existing test suites. Together with the curated checklist on ML testing, FixML leverages Large Language Models (LLMs) to assess the completeness of the test suites and output detailed evaluation reports with completeness scores and specific reasons. This assists users in understanding the performance of their current test suites with insights. Additionally, FixML can generate test function specifications corresponding to the curated checklist, helping users utilizing their test suites.

-

In return, FixML solution combines the scalability of automated testing with the reliability of expert evaluation. By automating the evaluation process, FixML significantly reduces the time and human effort required to assess the quality of ML test suites. This popularizes thorough and efficient quality assessment on ML projects.

+

The development of FixML has been driven by the need for better quality assurance in ML systems and by the current limitations of traditional testing methods for ML projects. FixML provides curated checklists and automated tools that enhance the evaluation and creation of test suites for ML projects. This, in turn, significantly reduces the time and effort required to assess the completeness of ML test suites, and thus promotes thorough and efficient assessment of ML projects.

Limitation & Future Improvement

-

While FixML provides substantial benefits, there are limitations and areas that aim to be addressed in future development:

+

While FixML provides substantial benefits, there are limitations and areas to be addressed in future development:

  1. Specialized Checklist
-

The current checklist is designed to be general and may not cover all specific requirements for different ML projects. Future development will focus on creating more specialized checklists for different domains and project types, allowing for more tailored evaluations. Since the format of the checklist is designed to allow users to easily expand, edit and select checklist items based on their specific use case, we welcome any collaboration with ML researchers on the creation of specalized checklists.

+

The default checklist is general and may not cover all requirements for different ML projects. Future development will focus on creating specialized checklists for tailored evaluations across various domains and project types. Collaboration with ML researchers is welcomed for creating specialized checklists based on specific use cases.

  2. Enhanced Test Evaluator
-

Our current study unveils the varying accuracy and consistency issues on the evaluation results using OpenAI GPT models. Future improvements involves prompt enhancement with prompt engineering techniques and support for multiple LLMs for higher performance and flexibility of FixML test evaluator functionality. We also expect to deliver user guidelines in editing the prompts in our system, where ML developers can customize prompts for better performance and collaborate with us to embed them into the system.

+

Our study reveals accuracy and consistency issues in the evaluation results using OpenAI's GPT-3.5-turbo model. Future improvements involve better prompt engineering techniques and support for multiple LLMs for enhanced performance and flexibility. User guidelines on prompt creation will be provided to facilitate collaboration with ML developers.

  3. Customized Test Specification
-

FixML test specification generator currently produces general test function skeletons solely based on the curated checklist without the context of the specific ML projects. Future developments will involve the integration of the ML project codebase in the generation process to output customized test functions skeletons. This further lower the barrier of ML users in creating comprehensive test suites relevant to the projects.

+

Future developments will integrate project-specific information to produce customized test function skeletons. This may further encourage users to create comprehensive tests.

  4. Workflow Optimization #FIXME: have to review whether to include as it seems lower priority.
-

The current test evaluator and test specification generator are separate entities. This could be improved by embedding a workflow engine that allows the system to automatically take actions based on the LLM response. For instance, if the LLM response suggests that test suites are partially satisfied or non-satisfied, the system could automatically run the test generator to produce test function skeletons and then reevaluate them until they are satisfied or some threshold is met. This would create a more cohesive and efficient workflow, reducing manual intervention and improving overall system performance.

+

The test evaluator and test specification generator are currently separate. Future improvements could embed a workflow engine that automatically takes actions based on LLM responses. This would create a more cohesive and efficient workflow, reduce manual intervention, and improve overall system performance.

  5. Performance Optimization #FIXME: have to review whether to include as it seems lower priority.
-

Performance optimization is another critical area for future development. As FixML handles large codebases and complex evaluations, optimizing the system to handle these tasks more efficiently is essential. This includes improving the speed and accuracy of the LLM responses, reducing the time taken to analyze and generate reports, and ensuring the system can scale effectively to handle more extensive and more complex projects.

-

By addressing these limitations and focusing on these future improvements, FixML will become an even more powerful tool for ensuring the quality and robustness of machine learning and data science projects.

+

As FixML handles large codebases and complex evaluations, performance optimization is essential. Future developments will focus on improving the speed and accuracy of LLM responses, reducing analysis and report generation times, and ensuring scalability for handling larger and more complex projects.

+

By addressing these limitations and implementing future improvements, we aim for FixML to achieve better performance and contribute to the development of better ML systems, and ultimately enhance human life.

+

References

+
+Alexander, Rohan, Lindsay Katz, Callandra Moore, and Zane Schwartz. 2023. “Evaluating the Decency and Consistency of Data Validation Tests Generated by LLMs.” arXiv Preprint arXiv:2310.01402. +
+
+Belanger, Ashley. 2024. “Air Canada Must Honor Refund Policy Invented by Airline’s Chatbot.” Ars Technica. https://arstechnica.com/tech-policy/2024/02/air-canada-must-honor-refund-policy-invented-by-airlines-chatbot/. +
+
+Gawande, Atul. 2010. Checklist Manifesto, the (HB). Penguin Books India. +
+
+Grand-View-Research. 2021. “Artificial Intelligence Market Size, Share & Trends Analysis Report by Solution, by Technology (Deep Learning, Machine Learning), by End-Use, by Region, and Segment Forecasts, 2023 2030.” Grand View Research San Francisco. +
+
+Jordan, Jeremy. 2020. “Effective Testing for Machine Learning Systems.” https://www.jeremyjordan.me/testing-ml/. +
+
+Kapoor, Sayash, and Arvind Narayanan. 2022. “Leakage and the Reproducibility Crisis in ML-Based Science.” arXiv Preprint arXiv:2207.07048. +
+
+Nunwick, Alice. 2023. “ITutorGroup Settles AI Hiring Lawsuit Alleging Age Discrimination.” Verdict. https://www.verdict.co.uk/itutorgroup-settles-ai-hiring-lawsuit-alleging-age-discrimination/. +
+
+Openja, Moses, Foutse Khomh, Armstrong Foundjem, Zhen Ming, Mouna Abidi, Ahmed E Hassan, et al. 2023. “Studying the Practices of Testing Machine Learning Software in the Wild.” arXiv Preprint arXiv:2312.12604. +
+
+Pineau, Joelle, Philippe Vincent-Lamarre, Koustuv Sinha, Vincent Larivière, Alina Beygelzimer, Florence d’Alché-Buc, Emily Fox, and Hugo Larochelle. 2021. “Improving Reproducibility in Machine Learning Research (a Report from the Neurips 2019 Reproducibility Program).” Journal of Machine Learning Research 22 (164): 1–20. +
+
+Regidi, Asheeta. 2019. “SEBI’s Circular: The Black Box Conundrum and Misrepresentation in AI-Based Mutual Funds.” Firstpost. https://www.firstpost.com/business/sebis-circular-the-black-box-conundrum-and-misrepresentation-in-ai-based-mutual-funds-6625161.html. +
+
+Shepardson, David. 2023. “GM’s Cruise Recalling 950 Driverless Cars After Pedestrian Dragged in Crash.” Reuters. https://www.reuters.com/business/autos-transportation/gms-cruise-recall-950-driverless-cars-after-accident-involving-pedestrian-2023-11-08/. +
+
+Team, Microsoft Industry Solutions Engineering. 2023. “Testing Data Science and MLOps Code.” Testing Data Science and MLOps Code - Engineering Fundamentals Playbook. https://microsoft.github.io/code-with-engineering-playbook/machine-learning/ml-testing/. +
+
+Zhang, Yue, Yafu Li, Leyang Cui, Deng Cai, Lemao Liu, Tingchen Fu, Xinting Huang, et al. 2023. “Siren’s Song in the AI Ocean: A Survey on Hallucination in Large Language Models.” https://arxiv.org/abs/2309.01219. +
+
@@ -73,7 +92,7 @@ - + - - - - - - - -
- - -
- - - -
- - - -
-
#!pip install scipy altair
-
-
-
import os
-import json
-import yaml
-import pandas as pd
-from collections import Counter
-
-id_item_map = {
-    '2.1': 'Ensure Data File Loads as Expected',
-    '3.2': 'Data in the Expected Format',
-    '3.5': 'Check for Duplicate Records in Data',
-    '4.2': 'Verify Data Split Proportion',
-    '5.3': 'Ensure Model Output Shape Aligns with Expectation',
-    '6.1': 'Verify Evaluation Metrics Implementation',
-    '6.2': "Evaluate Model's Performance Against Thresholds"
-}
-
-ground_truth = [
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-]
-
-def get_score_report_from_response(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        response = json.load(file)
-    
-    reports = [] # report for each test file
-    for result in response['call_results']: # one test file per response
-        if result['parsed_response']:
-            resp = result['parsed_response']['results']
-            for item in resp:
-                item['file'] = result['files_evaluated'][0] 
-                item['success'] = result['success']
-                reports.append(item)
-        # FIXME: not handled failed run for now
-        # else: # if the run is failed, the parsed_response will be None
-        #     reports.append({
-        #         'ID': '2.1', 
-        #         'Title': '',
-        #         'Requirement': '',
-        #         'Observation': '',
-        #         'Functions': [],
-        #         'Evaluation': '',
-        #         'Score': 0,
-        #         'file': result['files_evaluated'][0],
-        #         'success': result['success']
-        #     })
-    
-    reports_df = pd.DataFrame(reports)
-    df = (
-        reports_df
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = reports_df.groupby(['file'])['success'].all()
-    df['response_path'] = os.path.abspath(resp_path)
-    
-    return df.reset_index()
-
-def get_scores_by_repo_by_run_by_file(batch_run_dir_path, record_yml='record_combine.yml', verbose=False):
-    ''' Get score for each checklist item, by repository, by run and by test file
-    '''
-    with open(os.path.join(batch_run_dir_path, record_yml), 'r') as file:
-        config = pd.DataFrame(yaml.safe_load(file))
-
-    config['response_path'] = config['response_path'].apply(
-        lambda x: os.path.abspath(os.path.join(batch_run_dir_path, x))
-    )
-    
-    tmp = [
-        get_score_report_from_response(
-            os.path.join(batch_run_dir_path, path),
-            verbose=verbose
-        ) for path in config['response_path']
-    ]
-    tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-    
-    return config.merge(tmp, on='response_path', how='left')
-
-def preprocess(df_repo_run_file, id_item_map=None):
-    if id_item_map is None:
-        id_item_map = {
-            '2.1': 'Ensure Data File Loads as Expected',
-            '3.2': 'Data in the Expected Format',
-            '3.5': 'Check for Duplicate Records in Data',
-            '4.2': 'Verify Data Split Proportion',
-            '5.3': 'Ensure Model Output Shape Aligns with Expectation',
-            '6.1': 'Verify Evaluation Metrics Implementation',
-            '6.2': "Evaluate Model's Performance Against Thresholds"
-        }
-
-    # prepare score data by repo, by run
-    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-        id: ['max'] for id in id_item_map.keys()
-    })
-    df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-    df_repo_run = df_repo_run.reset_index()
-    
-    # prepare statistics of scores by repo
-    df_repo__stat = df_repo_run.groupby(['repo']).agg({
-        id: ['mean', 'std', 'count'] for id in id_item_map.keys()
-    })
-    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-    df_repo__stat = (
-        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-        .reset_index()
-        .rename_axis(None, axis=1)
-    )
-    df_repo__stat['title'] = df_repo__stat['id'].apply(lambda x: id_item_map[x])
-    df_repo__stat['id_title'] = df_repo__stat['id'] + '. ' + df_repo__stat['title']
-    
-    # prepare counting of scores by repo
-    df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-    for id in list(id_item_map.keys())[1:]:
-        df_repo__count = df_repo__count.merge(
-            df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-            on=['repo', 'level_1'],
-            how='outer'
-        )
-    #df_repo__count['title'] = df_repo__count['id'].apply(lambda x: id_item_map[x])
-    
-    df_repo__count = df_repo__count.fillna(0)
-
-    df_repo_run = df_repo_run.melt(id_vars=['repo', 'run'], var_name='id', value_name='score')
-    df_repo_run['title'] = df_repo_run['id'].apply(lambda x: id_item_map[x])
-    df_repo_run['id_title'] = df_repo_run['id'] + '. ' + df_repo_run['title']
-    
-    return (df_repo_run, df_repo__stat, df_repo__count)
-
-
-
df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/processed/batch_run_3.5-turbo/')
-df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)
-
-
-
df_repo_run.to_csv('score_by_repo_run_3.5-turbo.csv', index=False)
-df_repo__stat.to_csv('score_stat_by_repo_3.5-turbo.csv', index=False)
-df_repo__count.to_csv('score_count_by_repo_3.5-turbo.csv', index=False)
-
-
-
df_repo_run_file = get_scores_by_repo_by_run_by_file('../../data/processed/batch_run_4o/')
-df_repo_run, df_repo__stat, df_repo__count = preprocess(df_repo_run_file)
-
-df_repo_run.to_csv('score_by_repo_run_4o.csv', index=False)
-df_repo__stat.to_csv('score_stat_by_repo_4o.csv', index=False)
-df_repo__count.to_csv('score_count_by_repo_4o.csv', index=False)
-
-
-
ground_truth_df = pd.DataFrame(ground_truth)
-ground_truth_df['title'] = ground_truth_df['id'].apply(lambda x: id_item_map[x])
-ground_truth_df = ground_truth_df.pivot(index=['id', 'title'], columns='repo', values='score')
-ground_truth_df.to_csv('ground_truth.csv')
-
-
-
#df_repo__count.melt(id_vars=['repo', 'level_1'])
-
-
-
-
- - - -
- -
- - - - \ No newline at end of file diff --git a/report/final_report/docs/02_finding-report.html b/report/final_report/docs/02_finding-report.html deleted file mode 100644 index 3102504..0000000 --- a/report/final_report/docs/02_finding-report.html +++ /dev/null @@ -1,2241 +0,0 @@ - - - - - - - - - -NOTE: the result is based on the code base abb9a21, which is similar to the commit 69d61a9 in the main branch - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
- - -
- - - -
- -
-
-

NOTE: the result is based on the code base abb9a21, which is similar to the commit 69d61a9 in the main branch

-
- - - -
- - - - -
- - -
- -
-
import scipy
-import pickle
-import yaml
-import pandas as pd
-import altair as alt
-from collections import Counter
-
-
-
def get_report(response):
-    report = []
-    for result in response.call_results:
-        if result.parsed_response:
-            resp = result.parsed_response['results']
-            for item in resp:
-                item['file'] = result.files_evaluated[0] 
-                item['success'] = result.success
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result.files_evaluated[0],
-                'success': result.success
-            })
-    return pd.DataFrame(report)
-
-def extract_file_and_scores(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        response = pickle.load(file)
-    report = get_report(response)
-    df = (
-        report
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = report.groupby(['file'])['success'].all()
-    df['response_path'] = resp_path
-    return df.reset_index()
-
-def generate_stat_plot(df_repo__stat, ground_truth=None, facet_col='repo', repo=None, id=None):
-    """
-    Generate Stat plot across all repo and all checklist item
-    Optional to incorporate ground truth and select specific repo/checklist item
-    """
-    if facet_col == 'repo':
-        x_col = 'id'
-        x_title = 'Checklist ID'
-    elif facet_col == 'id':
-        x_col = 'repo'
-        x_title = 'Repository'
-    
-    # the base chart
-    if repo:
-        df_repo__stat = df_repo__stat.query(f'repo == "{repo}"')
-    if id:
-        df_repo__stat = df_repo__stat.query(f'id == "{id}"')
-    
-    base = alt.Chart().transform_calculate(
-        min="max(0, datum.mean-datum.std)",
-        max="min(1, datum.mean+datum.std)"
-    )
-    
-    # generate the points
-    points = base.mark_point(
-        filled=True,
-        size=50,
-        color='black'
-    ).encode(
-        x=alt.X(f'{x_col}:O').axis(labelAngle=0).title(x_title),
-        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
-    )
-    
-    # generate the error bars
-    errorbars = base.mark_errorbar().encode(
-        x=f"{x_col}:O",
-        y=alt.Y("min:Q").title('1 SD'),
-        y2="max:Q"
-    )
-
-    plot = points + errorbars
-    
-    if ground_truth is not None:
-        # generate points of ground truth
-        if repo:
-            ground_truth = ground_truth.query(f'repo == "{repo}"')
-        if id:
-            ground_truth = ground_truth.query(f'id == "{id}"')
-        
-        df_repo__stat = pd.merge(df_repo__stat, ground_truth, how='left', on=['repo', 'id'])
-        
-        gt_points = alt.Chart().mark_point(
-            filled=True,
-            size=100,
-            color='green',
-            shape="diamond"
-        ).encode(
-            x=alt.X(f'{x_col}:O'),
-            y=alt.Y('score:Q')
-        )
-
-        plot += gt_points
-
-    plot = alt.layer(
-                plot,
-                data=df_repo__stat
-            ).properties(
-                width=400,
-            ).facet(
-                column=f'{facet_col}',
-                columns=2
-            )
-
-    return plot
-
-
-

preprocess data

-
-
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']
-
-#result_path = '../draft/batch_run_results/record_combine.yml'
-result_path = '../data/processed/batch_run/record_combine.yml'
-with open(result_path, 'r') as file:
-    config = pd.DataFrame(yaml.safe_load(file))
-
-# prepare score data by repo, run, file
-tmp = [
-    extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
-]
-tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-
-raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')
-
-
-
# filter non-test files in qlib
-df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')
-
-# prepare score data by repo, run
-df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-    id: ['max'] for id in checklist_ids
-})
-df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-df_repo_run = df_repo_run.reset_index()
-
-# prepare statistics of scores by repo
-df_repo__stat = df_repo_run.groupby(['repo']).agg({
-    id: ['mean', 'std', 'count'] for id in checklist_ids
-})
-df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-df_repo__stat = (
-    df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-    .reset_index()
-    .rename_axis(None, axis=1)
-)
-
-# prepare counting of scores by repo
-df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-for id in checklist_ids[1:]:
-    df_repo__count = df_repo__count.merge(
-        df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-        on=['repo', 'level_1'],
-        how='outer'
-    )
-
-df_repo__count = df_repo__count.fillna(0)
-
-
-
-

Runs Quality

-
-

1. Some non-test files are included in the evaluation

-

For example, ./nanodet/nanodet/trainer/task.py:

-
-
raw_df_repo_run_file.query('repo == "nanodet"')['file'].unique()[:3]
-
-
array(['../data/raw/openja/nanodet/nanodet/trainer/task.py',
-       '../data/raw/openja/nanodet/tests/test_configs/test_config.py',
-       '../data/raw/openja/nanodet/tests/test_data/test_batch_process.py'],
-      dtype=object)
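One possible mitigation, sketched below on the assumption that test files follow the usual pytest naming conventions (test_*.py or *_test.py; note this differs from the path-based qlib filter used above), is to screen candidate files by name before evaluation:

import fnmatch
import os

def looks_like_test_file(path):
    """Heuristic: keep only files named test_*.py or *_test.py."""
    name = os.path.basename(path)
    return fnmatch.fnmatch(name, 'test_*.py') or fnmatch.fnmatch(name, '*_test.py')

# files that this heuristic would exclude, e.g. nanodet/trainer/task.py
raw_df_repo_run_file[~raw_df_repo_run_file['file'].apply(looks_like_test_file)]['file'].unique()[:3]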
-
-
-
-
-

2. Evaluation on the file magenta/magenta/models/music_vae/data_test.py always fails

-
-
df_repo_run_file[~df_repo_run_file.success]['file'].unique()
-
-
array(['../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
-       '../data/raw/openja/paperless-ng/src/documents/tests/test_api.py'],
-      dtype=object)
-
-
-
-
-

3. DeepSpeech, lightfm and magenta have the fewest (Python) test files

-
-
df_repo_run_file.query('run == 1').groupby(['repo'])['file'].count().reset_index()
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo          file
0  DeepSpeech     3
1  apollo        14
2  lightfm        7
3  magenta        8
4  mmf           70
5  mycroft-core  64
6  nanodet       42
7  paperless-ng  35
8  qlib          31
- -
-
-
-
-
-

4. The test files are not always in a tests/ folder. Is it good practice to always do that? Should one of the checklist items ensure that all tests are placed under a tests/ folder? (A possible check is sketched after the example below.)

-

For example, magenta

-
-
df_repo_run_file.query('repo == "magenta"')['file'].unique()
-
-
array(['../data/raw/openja/magenta/conftest.py',
-       '../data/raw/openja/magenta/magenta/common/state_util_test.py',
-       '../data/raw/openja/magenta/magenta/models/coconet/export_saved_model_test.py',
-       '../data/raw/openja/magenta/magenta/models/coconet/lib_data.py',
-       '../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
-       '../data/raw/openja/magenta/magenta/models/onsets_frames_transcription/create_dataset_lib_test.py',
-       '../data/raw/openja/magenta/magenta/models/score2perf/datagen_beam_test.py',
-       '../data/raw/openja/magenta/magenta/pipelines/pipeline_test.py'],
-      dtype=object)
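If such a convention were adopted as a checklist item, a simple check (a sketch, not an existing FixML feature) could flag test files that do not live under any tests/ directory:

def outside_tests_folder(paths):
    """Return the test files that are not located under a 'tests' directory."""
    return [p for p in paths if 'tests' not in p.split('/')]

# for magenta this flags, for example, magenta/common/state_util_test.py
outside_tests_folder(df_repo_run_file.query('repo == "magenta"')['file'].unique())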
-
-
-
-
-
-

Findings on 8 repos

-
-
df_repo_run_file.repo.unique()
-
-
array(['lightfm', 'qlib', 'mmf', 'nanodet', 'magenta', 'DeepSpeech',
-       'paperless-ng', 'mycroft-core', 'apollo'], dtype=object)
-
-
-
-

1. Overview of accuracy and consistency lightfm evaluation

-

Let the ground truth of lightfm be as follows:

-
-
# Ground truth
-ground_truth = pd.DataFrame([
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-])
-ground_truth[ground_truth.repo == 'lightfm']
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo     id   score
0  lightfm  2.1  1.0
1  lightfm  3.2  1.0
2  lightfm  3.5  0.0
3  lightfm  4.2  1.0
4  lightfm  5.3  0.5
5  lightfm  6.1  1.0
6  lightfm  6.2  1.0
- -
-
-
-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="lightfm", facet_col='repo')
-
- - -
- -
-
-

The distribution of the scores for each checklist item:

-
-
df_repo__count.query('repo == "lightfm"')
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo     level_1  2.1   3.2   3.5   4.2   5.3   6.1   6.2
6  lightfm  0.0       0.0   1.0  19.0   0.0  18.0   0.0   0.0
7  lightfm  0.5       1.0  29.0   6.0  27.0  12.0  20.0   4.0
8  lightfm  1.0      29.0   0.0   5.0   3.0   0.0  10.0  26.0
- -
-
-
-

Observations: The system evaluation broadly aligns with our manual evaluation:
  • for items that we consider “Satisfied” (score = 1), the system mostly outputs 0.5 or 1
  • for items that we consider “Partially Satisfied” or “Not Satisfied”, the system mostly outputs 0.5 or 0
  • some checklist items display high variance, e.g. 3.5, 5.3 and 6.1

-
-
-

2. Overview of qlib

-

Let the ground truth of qlib be as follows (FIXME: to be confirmed):

-
-
# Ground truth
-ground_truth[ground_truth.repo == 'qlib']
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    repo  id   score
7   qlib  2.1  0.5
8   qlib  3.2  1.0
9   qlib  3.5  0.0
10  qlib  4.2  0.5
11  qlib  5.3  1.0
12  qlib  6.1  1.0
13  qlib  6.2  1.0
- -
-
-
-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="qlib", facet_col='repo')
-
- - -
- -
-
-
-
df_repo__count.query('repo == "qlib"')
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    repo  level_1  2.1   3.2   3.5   4.2   5.3   6.1   6.2
24  qlib  0.0       0.0   1.0  29.0   3.0  14.0   4.0   1.0
25  qlib  0.5       0.0  12.0   1.0  27.0  16.0  24.0  26.0
26  qlib  1.0      30.0  17.0   0.0   0.0   0.0   2.0   3.0
- -
-
-
-

Observations:
  • There is more disagreement between the system and manual evaluation, especially for 5.3, 6.1 and 6.2.
  • The per-item consistency in this repo differs from lightfm; e.g. the variance for 3.5 is greatly reduced, while the variance for 3.2 becomes larger.
  • However, qlib is not just a machine learning project; it also contains a software system. For example, it generates a lot of data by itself instead of reading a data file to perform analysis, which seems to deviate from the objective of 2.1.

-
-
-

3. The consistency of each checklist item

-
    -
  • Why is it important? If the score of a particular item varies a lot when evaluating a repository, it might mean that its prompt (Requirement) is confusing to the LLM, or the checklist item itself is not well defined.
  • -
-
-
df_repo__stat.pivot(index='id', columns='repo', values='std')
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
repoDeepSpeechapollolightfmmagentammfmycroft-corenanodetpaperless-ngqlib
id
2.10.4794630.1525640.0912870.4234510.0000000.0000000.0000000.2421170.000000
3.20.4068380.2150920.0912870.1895250.2450660.2780270.2397320.0912870.285673
3.50.0000000.0000000.3880400.2520030.1268540.0000000.2520030.0000000.091287
4.20.0000000.0000000.1525640.0912870.1268540.0000000.2542740.0000000.152564
5.30.0000000.0000000.2491360.0000000.1268540.0000000.0000000.0000000.253708
6.10.3511070.1728730.2397320.2520030.2330460.0000000.2856730.0000000.224888
6.20.0000000.0000000.1728730.0000000.2012890.2537080.2604150.1268540.182574
- -
-
-
-
-
alt.Chart(df_repo__stat).mark_boxplot().encode(
-    x="std:Q",
-    y='id:N'
-).properties(
-    height=200,
-    width=400
-)
-
- - -
- -
-
-

Observations:
  • The evaluation of checklist item 2.1 Ensure Data File Loads as Expected is usually stable.
  • When evaluating a repository, 50% of the time its standard deviation is smaller than 0.05, the smallest among the items.

-

Below is the breakdown of item scores for each repository:
-(NOTE: only lightfm and qlib have ground truth, shown as green diamonds)

-
-
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, facet_col='id')
-
- - -
- -
-
-

Observations:
  • (TBC) The standard deviations for items 3.5 and 5.3 vary greatly across repositories, which might imply that the test cases in some repositories are confusing to the LLM while others are clear.
  • (TBC) The standard deviations for items 5.3, 6.1 and 6.2 are relatively high and consistent, which might imply there is room to refine the prompts to reduce the consistency issue.

-
-
-

4. The consistency of each checklist item, compared to lightfm

-
    -
  • Why is it important? We optimized the consistency of our system using lightfm. Therefore, we treat this repository as a benchmark. If a particular checklist item has much worse consistency in another repository, that might mean that the prompt for that item is not generalizable (a quantitative check is sketched below).
  • -
-
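One way to make “much worse consistency” precise is a variance-ratio comparison against the lightfm benchmark, as sketched below (the scores are bounded and discrete, so the normality assumption behind the F distribution is rough at best; the ratio plot further below draws a related F-based threshold line):

import scipy.stats

def worse_than_benchmark(std_other, std_lightfm, n_runs=30, alpha=0.05):
    """Variance-ratio check: under normality, s1^2 / s2^2 follows F(n-1, n-1)."""
    ratio = (std_other / std_lightfm) ** 2
    critical = scipy.stats.f.ppf(1 - alpha / 2, n_runs - 1, n_runs - 1)
    return ratio > critical, ratio, critical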

Below are the standard deviations over 30 runs for each checklist item and repository:

-
-
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
-stds
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
std
id2.13.23.54.25.36.16.2
repo
DeepSpeech0.4794630.4068380.0000000.0000000.0000000.3511070.000000
apollo0.1525640.2150920.0000000.0000000.0000000.1728730.000000
lightfm0.0912870.0912870.3880400.1525640.2491360.2397320.172873
magenta0.4234510.1895250.2520030.0912870.0000000.2520030.000000
mmf0.0000000.2450660.1268540.1268540.1268540.2330460.201289
mycroft-core0.0000000.2780270.0000000.0000000.0000000.0000000.253708
nanodet0.0000000.2397320.2520030.2542740.0000000.2856730.260415
paperless-ng0.2421170.0912870.0000000.0000000.0000000.0000000.126854
qlib0.0000000.2856730.0912870.1525640.2537080.2248880.182574
- -
-
-
-
-
stds_p = stds.copy()
-stds_p.columns = [col[1] for col in stds_p.columns]
-stds_p = stds_p.reset_index()
-stds_p = stds_p.melt(id_vars='repo', var_name='id')
-
-
-
stds_p.head()
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo        id   value
0  DeepSpeech  2.1  0.479463
1  apollo      2.1  0.152564
2  lightfm     2.1  0.091287
3  magenta     2.1  0.423451
4  mmf         2.1  0.000000
- -
-
-
-
-
# stripplot = (
-#     alt.Chart(stds_p)
-#     .mark_point(filled=True, size=100)
-#     .transform_calculate( 
-#         # Generate Gaussian jitter with a Box-Muller transform 
-#         jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
-#         # jitter='random()'
-#     ).encode( 
-#         y=alt.Y( 
-#             'jitter:Q', 
-#             title=None, 
-#             axis=alt.Axis(ticks=False, grid=True, labels=False), 
-#             scale=alt.Scale(), 
-#         ), 
-#         x=alt.X('value:Q'), 
-#         color=alt.Color('repo:N'),
-#         row=alt.Row( 
-#             'id:N',
-#             header=alt.Header(
-#                 labelFontSize=16,
-#                 labelAngle=0
-#             )
-#         ),
-#         tooltip='repo'
-#     ).configure_facet( 
-#         spacing=0
-#     ).configure_view( 
-#         stroke=None
-#     ).configure_axis( 
-#         labelFontSize=16, 
-#         titleFontSize=16
-#     ).properties(
-#         height=50, 
-#         width=600
-#     ) 
-# )
-    
-# stripplot 
-
-
-
def generate_jitterbox_plot(df_stds_p):
-    """
-    Generate jitterbox plot across all repo and all checklist item
-    """
-    box = alt.Chart().mark_boxplot(
-        color='grey',
-        opacity=0.5,
-        size=20,
-    ).encode(
-        x=alt.X('value:Q').title('SD(Score)'),
-        y=alt.Y('id:N', title=None, axis=alt.Axis(labelPadding=10, grid=False))
-    )
-    
-    stripplot = alt.Chart().mark_circle(size=100).encode(
-        y=alt.Y( 
-            'id:N',
-            axis=alt.Axis(ticks=False, grid=True, labels=True), 
-            scale=alt.Scale(), 
-        ), 
-        x='value:Q',
-        yOffset="jitter:Q",
-        color=alt.Color('id:N', legend=None),
-        tooltip='repo'
-    ).transform_calculate(
-        # Generate Gaussian jitter with a Box-Muller transform
-        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-    )
-    
-    plot = alt.layer(
-        box,
-        stripplot,
-        data=df_stds_p
-    ).configure_view( 
-        stroke=None
-    ).configure_axis( 
-        labelFontSize=16, 
-        titleFontSize=16
-    ).properties(
-        height=300, 
-        width=600
-    ) 
-    
-    return plot
-
-
-
generate_jitterbox_plot(stds_p)
-
- - -
- -
-
-
-
alt.Chart(df_repo__stat).mark_boxplot().encode(
-    x="std:Q",
-    y='id:N'
-).properties(
-    height=200,
-    width=400
-)
-
- - -
- -
-
-
-
# !pip install altair_catplot
-# !pip install seaborn
-
-
-
# import altair_catplot
-
-# altair_catplot.catplot(
-#     stds_p, 
-#     transform ='jitterbox', 
-#     mark ='point', 
-#     encoding = dict(
-#         x = alt.X('value:Q'), 
-#         y = alt.Y('id:N'), 
-#         color = alt.Color('repo:N')
-#     ) 
-# )
-
-
-
F = stds.drop(index='lightfm') / stds.loc['lightfm']
-
-base = alt.Chart(
-    F.melt(ignore_index=False).reset_index()[['repo', 'id', 'value']]
-).transform_calculate(
-    benchmark="1",
-    threshold=f"{scipy.stats.f.ppf(0.975, 29, 29)}"
-)
-
-point = base.mark_point(
-    filled=True,
-    size=100,
-).encode(
-    x=alt.X('value:Q').title("std ratio (c.f. lightfm)"),
-    y='id:N',
-    color='repo',
-    tooltip='repo'
-).properties(
-    height=200,
-    width=400
-)
-
-point \
-+ base.mark_rule(color='black').encode(x="benchmark:Q") \
-+ base.mark_rule(color='red').encode(x="threshold:Q")
-# jitter instead of mark_point <-- prompt vs. repo problem?
-# prompt: sd of checklist item for all repo is high
-# repo: most of repo have low sd, the repo we're looking at has outlier
-
- - -
- -
-
-

Observations:
  • The evaluation of checklist item 3.2 Data in the Expected Format becomes much more unstable in most of the other repositories.
  • That of 2.1 is significantly unstable in paperless-ng, magenta and DeepSpeech, but this may be due to the repositories themselves.

-

TODO: look into the scores for item 3.2.

-
-
-

TODO: Given ground truth == 1, distribution of system score?

-
-
-

TODO: Given ground truth == 0, distribution of system score?

-
-
def generate_histogram_plot(df_repo_run_long, df_ground_truth=None, repo=None, id=None):
-    """
-    Generate histogram across all repo and all checklist item
-    Optional to incorporate ground truth and select specific repo/checklist item
-    """
-    # data
-    repo_data = df_repo_run_long.copy()
-    if repo:
-        repo_data = repo_data.query(f'repo == "{repo}"')
-    if id:
-        repo_data = repo_data.query(f'id == "{id}"')
-
-    # base histogram chart
-    base = alt.Chart().mark_bar().encode(
-                x=alt.X('eval_score:Q', title='Score'), 
-                y=alt.Y('count()'), 
-                color=alt.value('grey'),
-                size=alt.value(20),
-            )
-    
-    if df_ground_truth is not None:
-        # data
-        gt_data = df_ground_truth.copy()
-        if repo:
-            gt_data = gt_data.query(f'repo == "{repo}"')
-        if id:
-            gt_data = gt_data.query(f'id == "{id}"')
-        
-        repo_data = pd.merge(repo_data, gt_data, how='left', on=['repo', 'id'])
-        repo_data['is_equal_to_gt'] = repo_data['eval_score'] == repo_data['score']
-        
-        # base histogram chart
-        base = base.encode(
-                    color=alt.Color('is_equal_to_gt', scale=alt.Scale(range=['grey', 'green']), legend=None)
-                )
-        base += base.mark_text().encode(
-            text=alt.value('Ground Truth'),
-            x='score',
-            size=alt.value(10),
-            color=alt.value('green'),
-        )
-
-    plot = alt.layer(
-                base,
-                data=repo_data
-            ).properties(
-                width=200,
-                height=200,
-            ).facet(
-                row='repo',
-                column='id'
-            )        
-    
-    return plot
-
-
-
-

Contingency Table

-
-
df_repo_run_p = pd.melt(df_repo_run, id_vars=['repo', 'run'], var_name='id', value_name='eval_score')
-df_repo_run_p = pd.merge(df_repo_run_p, ground_truth, how='inner', on=['repo', 'id'])
-df_repo_run_p = df_repo_run_p.rename(columns={'score': 'ground_truth'})
-pd.pivot_table(df_repo_run_p, values='run', index=['ground_truth'], columns=['eval_score'], aggfunc='count', fill_value=0)
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
eval_score0.00.51.0
ground_truth
0.0227835
0.5213930
1.02115990
- -
-
-
-
-
df_repo_run_p
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
reporunideval_scoreground_truth
0DeepSpeech12.11.00.0
1DeepSpeech22.11.00.0
2DeepSpeech32.10.00.0
3DeepSpeech42.11.00.0
4DeepSpeech52.10.00.0
..................
625qlib266.20.51.0
626qlib276.20.51.0
627qlib286.20.51.0
628qlib296.21.01.0
629qlib306.20.51.0
- -

630 rows × 5 columns

-
-
-
-
-
# generate_histogram_plot(df_repo_run_p, df_ground_truth=ground_truth)
-
- - -
-
- -
- -
- - - - \ No newline at end of file diff --git a/report/final_report/docs/02_plots-for-final-report.html b/report/final_report/docs/02_plots-for-final-report.html deleted file mode 100644 index 2ed9083..0000000 --- a/report/final_report/docs/02_plots-for-final-report.html +++ /dev/null @@ -1,931 +0,0 @@ - - - - - - - - - -plots-for-final-report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
- - -
- - - -
- - - -
-
import altair as alt
-import pandas as pd
-
-df_repo__stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')
-gt = pd.read_csv('ground_truth.csv')
-gt = gt.melt(id_vars=['id', 'title'], var_name='repo', value_name='ground_truth')
-
-df_repo__stat_with_gt = df_repo__stat.merge(gt, on=['id', 'title', 'repo'])
-
-base = alt.Chart(
-    df_repo__stat_with_gt.query('repo in ["lightfm", "qlib", "DeepSpeech"]')
-).transform_calculate(
-    min="max(0, datum.mean-datum.std)",
-    max="min(1, datum.mean+datum.std)"
-)
-    
-# generate the points
-points = base.mark_point(
-    filled=True,
-    size=50,
-    color='black'
-).encode(
-    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Score").axis(
-        labelExpr="datum.value % 0.5 ? null : datum.label"
-    ),
-    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),
-)
-
-# generate the points for ground truth
-gt_points = base.mark_point(
-    filled=True,
-    size=200,
-    color='green',
-    shape="diamond"
-).encode(
-    x=alt.X('ground_truth:Q'),
-    y=alt.Y('id_title:N')
-)
-
-# generate the error bars
-errorbars = base.mark_errorbar().encode(
-    x=alt.X("min:Q").title('1 SD'), #"id:N",
-    x2="max:Q",
-    y="id_title:N"
-)
-
-(gt_points + points + errorbars).facet(
-    column=alt.Column('repo:N').title(None)
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-)
-
- - -
- -
-
-
-
df_repo_run = pd.read_csv('score_by_repo_run_3.5-turbo.csv')
-
-df_repo_run = df_repo_run.merge(gt, on=['id', 'title', 'repo'])
-
-contingency_table = pd.pivot_table(
-    df_repo_run,
-    values='run', 
-    index=['repo', 'id_title', 'ground_truth'], 
-    columns=['score'],
-    aggfunc='count', 
-    fill_value=0
-)
-contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth']
-contingency_table.sort_index(level=[0, 2])
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
score0.00.51.0
RepositoryChecklist ItemGround Truth
lightfm3.5. Check for Duplicate Records in Data0.03000
5.3. Ensure Model Output Shape Aligns with Expectation0.51290
2.1. Ensure Data File Loads as Expected1.00030
3.2. Data in the Expected Format1.00300
4.2. Verify Data Split Proportion1.001119
6.1. Verify Evaluation Metrics Implementation1.00525
6.2. Evaluate Model's Performance Against Thresholds1.00129
qlib3.5. Check for Duplicate Records in Data0.02370
2.1. Ensure Data File Loads as Expected0.50030
4.2. Verify Data Split Proportion0.53252
3.2. Data in the Expected Format1.001416
5.3. Ensure Model Output Shape Aligns with Expectation1.01254
6.1. Verify Evaluation Metrics Implementation1.021810
6.2. Evaluate Model's Performance Against Thresholds1.00246
- -
-
-
-
-
stds = df_repo__stat[['repo', 'std', 'id_title']].pivot(index='repo', columns='id_title').copy()
-stds.columns = [col[1] for col in stds.columns]
-stds = stds.reset_index()
-stds = stds.melt(id_vars='repo', var_name='id_title')
-
-base = alt.Chart(stds)
-
-box = base.mark_boxplot(
-    color='grey',
-    opacity=0.5,
-    size=20,
-).encode(
-    x=alt.X('value:Q').title('Standard Deviation of Scores'),
-    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))
-)
-
-stripplot = base.mark_circle(size=100).encode(
-    y=alt.Y( 
-        'id_title:N',
-        axis=alt.Axis(ticks=False, grid=True, labels=True), 
-        scale=alt.Scale(), 
-    ), 
-    x='value:Q',
-    yOffset="jitter:Q",
-    color=alt.Color('id_title:N', legend=None),
-    tooltip='repo'
-).transform_calculate(
-    # Generate Gaussian jitter with a Box-Muller transform
-    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-)
-
-(
-    box + stripplot
-).configure_view( 
-    stroke=None
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=300, 
-    width=600,
-    title="30 Runs on Openja's Repositories for each Checklist Item"
-) 
-
- - -
- -
-
-
-
df_repo_4o__stat = pd.read_csv('score_stat_by_repo_4o.csv')
-df_repo_4o__stat_with_gt = df_repo_4o__stat.merge(gt, on=['id', 'title', 'repo'])
-df_repo_4o__stat_with_gt['model'] = 'gpt-4o'
-
-df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query("repo == 'lightfm'").copy()
-df_repo_35turbo__stat_with_gt['model'] = 'gpt-3.5-turbo'
-
-df_model_comp = pd.concat(
-    (df_repo_35turbo__stat_with_gt, df_repo_4o__stat_with_gt), 
-    axis=0
-)
-
-base = alt.Chart(
-    df_model_comp
-).transform_calculate(
-    min="max(0, datum.mean-datum.std)",
-    max="min(1, datum.mean+datum.std)"
-)
-    
-# generate the points
-points = base.mark_point(
-    filled=True,
-    size=50,
-    color='black'
-).encode(
-    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Score").axis(
-        labelExpr="datum.value % 0.5 ? null : datum.label"
-    ),
-    y=alt.Y('id_title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))#.scale(domainMin=0, domainMax=1).title('Score'),
-)
-
-# generate the points for ground truth
-gt_points = base.mark_point(
-    filled=True,
-    size=200,
-    color='green',
-    shape="diamond"
-).encode(
-    x=alt.X('ground_truth:Q'),
-    y=alt.Y('id_title:N')
-)
-
-# generate the error bars
-errorbars = base.mark_errorbar().encode(
-    x=alt.X("min:Q").title('1 SD'), #"id:N",
-    x2="max:Q",
-    y="id_title:N"
-)
-
-(gt_points + points + errorbars).facet(
-    column=alt.Column('model:N').title(None)
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-)
-
- - -
- -
-
- - - -
- -
- - - - \ No newline at end of file diff --git a/report/final_report/docs/04_plots-for-presentations.html b/report/final_report/docs/04_plots-for-presentations.html deleted file mode 100644 index eea2073..0000000 --- a/report/final_report/docs/04_plots-for-presentations.html +++ /dev/null @@ -1,1036 +0,0 @@ - - - - - - - - - -Accuracy: Contingency table - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
- - -
- - - -
- -
-
-

Accuracy: Contingency table

-
- - - -
- - - - -
- - -
- -
-
!pip install scipy altair
-
-
Requirement already satisfied: scipy in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (1.13.1)
-Collecting altair
-  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
-Requirement already satisfied: numpy<2.3,>=1.22.4 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from scipy) (1.26.4)
-Requirement already satisfied: jinja2 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (3.1.4)
-Requirement already satisfied: jsonschema>=3.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (4.22.0)
-Requirement already satisfied: packaging in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (23.2)
-Requirement already satisfied: pandas>=0.25 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from altair) (2.2.2)
-Collecting toolz (from altair)
-  Using cached toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
-Requirement already satisfied: attrs>=22.2.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (23.2.0)
-Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (2023.12.1)
-Requirement already satisfied: referencing>=0.28.4 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (0.35.1)
-Requirement already satisfied: rpds-py>=0.7.1 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jsonschema>=3.0->altair) (0.18.1)
-Requirement already satisfied: python-dateutil>=2.8.2 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2.9.0.post0)
-Requirement already satisfied: pytz>=2020.1 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2024.1)
-Requirement already satisfied: tzdata>=2022.7 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from pandas>=0.25->altair) (2024.1)
-Requirement already satisfied: MarkupSafe>=2.0 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from jinja2->altair) (2.1.5)
-Requirement already satisfied: six>=1.5 in /Users/johnshiu/miniconda3/envs/test-creation/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=0.25->altair) (1.16.0)
-Using cached altair-5.3.0-py3-none-any.whl (857 kB)
-Using cached toolz-0.12.1-py3-none-any.whl (56 kB)
-Installing collected packages: toolz, altair
-Successfully installed altair-5.3.0 toolz-0.12.1
-
-
-
-
import scipy
-import pickle
-import json
-import yaml
-import pandas as pd
-import altair as alt
-from collections import Counter
-
-
-
def get_report(response):
-    report = []
-    for result in response.call_results:
-        if result.parsed_response:
-            resp = result.parsed_response['results']
-            for item in resp:
-                item['file'] = result.files_evaluated[0] 
-                item['success'] = result.success
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result.files_evaluated[0],
-                'success': result.success
-            })
-    return pd.DataFrame(report)
-
-def get_report_json(response):
-    report = []
-    for result in response['call_results']:
-        if result['parsed_response']:
-            resp = result['parsed_response']['results']
-            for item in resp:
-                item['file'] = result['files_evaluated'][0] 
-                item['success'] = result['success']
-                report.append(item)
-        else:
-            report.append({
-                'ID': '2.1', # FIXME
-                'Title': '',
-                'Requirement': '',
-                'Observation': '',
-                'Functions': [],
-                'Evaluation': '',
-                'Score': 0,
-                'file': result['files_evaluated'][0],
-                'success': result['success']
-            })
-    return pd.DataFrame(report)
-
-def extract_file_and_scores(resp_path, verbose=False):
-    if verbose:
-        print(resp_path)
-    with open(resp_path, 'rb') as file:
-        try:
-            response = pickle.load(file)
-            report = get_report(response)
-        except:
-            response = json.load(file)
-            report = get_report_json(response)
-    df = (
-        report
-        .pivot(index='file', columns='ID', values='Score')
-        .rename_axis(None, axis=1)
-    )
-    df['success'] = report.groupby(['file'])['success'].all()
-    df['response_path'] = resp_path
-    return df.reset_index()
-
-
-
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']
-
-def read_and_preprocess(result_path):
-    with open(result_path, 'r') as file:
-        config = pd.DataFrame(yaml.safe_load(file))
-    
-    # prepare score data by repo, run, file
-    tmp = [
-        extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
-    ]
-    tmp = pd.concat(tmp, axis=0).reset_index(drop=True)
-    
-    raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')
-
-    # filter non-test files in qlib
-    df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')
-    
-    # prepare score data by repo, run
-    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
-        id: ['max'] for id in checklist_ids
-    })
-    df_repo_run.columns = [col[0] for col in df_repo_run.columns]
-    df_repo_run = df_repo_run.reset_index()
-    
-    # prepare statistics of scores by repo
-    df_repo__stat = df_repo_run.groupby(['repo']).agg({
-        id: ['mean', 'std', 'count'] for id in checklist_ids
-    })
-    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
-    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
-    df_repo__stat = (
-        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
-        .reset_index()
-        .rename_axis(None, axis=1)
-    )
-    
-    # prepare counting of scores by repo
-    df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
-    for id in checklist_ids[1:]:
-        df_repo__count = df_repo__count.merge(
-            df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
-            on=['repo', 'level_1'],
-            how='outer'
-        )
-    
-    df_repo__count = df_repo__count.fillna(0)
-
-    return (df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count)
-
-
-
# Ground truth
-ground_truth = pd.DataFrame([
-    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
-    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
-    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
-    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
-    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
-    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
-    {'repo': 'qlib', 'id': '3.2', 'score': 1},
-    {'repo': 'qlib', 'id': '3.5', 'score': 0},
-    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
-    {'repo': 'qlib', 'id': '5.3', 'score': 1},
-    {'repo': 'qlib', 'id': '6.1', 'score': 1},
-    {'repo': 'qlib', 'id': '6.2', 'score': 1},
-    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
-    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
-])
-
-
-
id_item_map = {
-    '2.1': 'Ensure Data File Loads as Expected',
-    '3.2': 'Data in the Expected Format',
-    '3.5': 'Check for Duplicate Records in Data',
-    '4.2': 'Verify Data Split Proportion',
-    '5.3': 'Ensure Model Output Shape Aligns with Expectation',
-    '6.1': 'Verify Evaluation Metrics Implementation',
-    '6.2': "Evaluate Model's Performance Against Thresholds"
-}
-
-
-
#result_path = '../draft/batch_run_results/record_combine.yml'
-df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count = read_and_preprocess(
-    '../data/processed/batch_run/record_combine.yml'
-)
-
-
-
cont_table = pd.melt(
-    df_repo_run.query('(repo == "lightfm")')[['repo', 'run', '3.5', '4.2', '5.3']], 
-    id_vars=['repo', 'run'], var_name='id', value_name='System Output')
-cont_table = pd.merge(cont_table, ground_truth, how='inner', on=['repo', 'id'])
-cont_table = cont_table.rename(columns={'score': 'ground_truth'})
-cont_table['title'] = cont_table['id'].apply(lambda x: id_item_map[x])
-#cont_table = cont_table[['repo', 'title', 'ground_truth', 'System Output', 'run']]
-cont_table = pd.pivot_table(cont_table, values='run', index=['repo', 'id', 'title', 'ground_truth'], columns=['System Output'], aggfunc='count', fill_value=0)
-cont_table.index.names = ['Repository', 'ID', 'Title', 'Ground Truth']
-cont_table.sort_index(level=3)
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
System Output0.00.51.0
RepositoryIDTitleGround Truth
lightfm3.5Check for Duplicate Records in Data0.01965
5.3Ensure Model Output Shape Aligns with Expectation0.518120
4.2Verify Data Split Proportion1.00273
- -
-
-
-
-

Consistency: jitterbox plot

-
-
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id').copy()
-stds.columns = [col[1] for col in stds.columns]
-stds = stds.reset_index()
-stds = stds.melt(id_vars='repo', var_name='id')
-stds['title'] = stds['id'].apply(lambda x: id_item_map[x])
-
-
-
box = alt.Chart().mark_boxplot(
-    color='grey',
-    opacity=0.5,
-    size=20,
-).encode(
-    x=alt.X('value:Q').title('System Output Uncertainty'),
-    y=alt.Y('title:N', title=None, axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False))
-)
-
-stripplot = alt.Chart().mark_circle(size=100).encode(
-    y=alt.Y( 
-        'title:N',
-        axis=alt.Axis(ticks=False, grid=True, labels=True), 
-        scale=alt.Scale(), 
-    ), 
-    x='value:Q',
-    yOffset="jitter:Q",
-    color=alt.Color('id:N', legend=None),
-    tooltip='repo'
-).transform_calculate(
-    # Generate Gaussian jitter with a Box-Muller transform
-    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
-)
-
-plot = alt.layer(
-    box,
-    stripplot,
-    data=stds
-).configure_view( 
-    stroke=None
-).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=300, 
-    width=600,
-    title="30 Runs on Openja's Repositories for each Checklist Item"
-) 
-
-
-
plot
-
- - -
- -
-
-
-
-

Improvement from gpt-3.5-turbo to gpt-4o

-
-
#result_path = '../draft/batch_run_results/record_combine.yml'
-df_repo_run_file_4o, df_repo_run_4o, df_repo_4o__stat, df_repo_4o__count = read_and_preprocess(
-    '../data/processed/batch_run_4o/record_combine.yml'
-)
-
-
-
df_repo_4o__stat
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   repo     id   count  mean  std
0  lightfm  2.1  30.0   1.0   0.0
1  lightfm  3.2  30.0   1.0   0.0
2  lightfm  3.5  30.0   1.0   0.0
3  lightfm  4.2  30.0   1.0   0.0
4  lightfm  5.3  30.0   1.0   0.0
5  lightfm  6.1  30.0   1.0   0.0
6  lightfm  6.2  30.0   1.0   0.0
- -
-
-
-
-
df1 = df_repo__stat.query('(repo == "lightfm") & (id == "4.2")').copy()
-df1['model'] = ['gpt-3.5-turbo']
-
-df2 = df_repo_4o__stat.query('(repo == "lightfm") & (id == "4.2")').copy()
-df2['model'] = ['gpt-4o']
-
-df_model_comp = pd.concat((df1, df2), axis=0)
-
-
-
base = alt.Chart(df_model_comp).transform_calculate(
-    min="max(0, datum.mean-datum.std)",
-    max="min(1, datum.mean+datum.std)"
-)
-    
-# generate the points
-points = base.mark_point(
-    filled=True,
-    size=50,
-    color='black'
-).encode(
-    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("System Output").axis(
-        labelExpr="datum.value % 0.5 ? null : datum.label"
-    ),
-    y=alt.Y('model:N').title("Model")#.scale(domainMin=0, domainMax=1).title('Score'),
-)
-    
-# generate the error bars
-errorbars = base.mark_errorbar().encode(
-    x=alt.X("min:Q").title('1 SD'), #"id:N",
-    x2="max:Q",
-    y="model:N"
-)
-
-(points + errorbars).configure_axis( 
-    labelFontSize=12, 
-    titleFontSize=12
-).properties(
-    height=200,
-    width=400,
-    title={
-        'text': '30 Runs on Checklist Item: "Ensure Data File Loads as Expected"',
-        'subtitle': "Ground Truth = 1"
-    }
-)
-
- - -
- -
-
- - -
- -
- -
- - - - \ No newline at end of file diff --git a/report/final_report/docs/final_report.html b/report/final_report/docs/final_report.html index 268a2b9..9f4d0ee 100644 --- a/report/final_report/docs/final_report.html +++ b/report/final_report/docs/final_report.html @@ -7,7 +7,7 @@ -DSCI591 Capstone Final Report +final_report -
+
-

Caption: Comparison of the satisfaction determined by our system versus the ground truth for each checklist item and repository

+

Comparison of our system’s satisfaction determination versus the ground truth for each checklist item and repository

-

We found that our tool tends to undermine the actual satisfying cases. For the items that are actually satisfied (score = 1), our tool tends to classify as partially satisfied (score = 0.5), (FIXME: in the newer run, the actual 0.5 seems to be tagged quite accurately) while for those that are partially satisfied (score = 0.5), our tool often classifies as not satisfied (score = 0).

+

Our tool tends to underrate satisfying cases, which often classifies fully satisfied items as partially satisfied and partially satisfied items as not satisfied.

Code @@ -636,7 +631,7 @@

Accuracy

contingency_table.index.names = ['Repository', 'Checklist Item', 'Ground Truth'] contingency_table.sort_index(level=[0, 2])
-
+
@@ -767,13 +762,13 @@

Accuracy

-

Contingency table of the satisfaction determined by our system versus the ground truth

+

Contingency table of our system’s satisfaction determination versus the ground truth

-

The accuracy issue may be attributed to the need for improvement of prompts in our checklist.

- -
-

Consistency

-

Since the completeness score from LLMs contain randomness, we further studied the consistency of scores across checklist items and repositories.

+

The accuracy issue may be attributed to a need to improve our checklist prompts.

+
    +
  1. Consistency
  2. +
+

As the completeness scores from LLMs contain randomness, we examined the consistency of completeness scores across checklist items and repositories.

Code @@ -821,31 +816,31 @@

Consistency

title="30 Runs on Openja's Repositories for each Checklist Item" )
-
+
-
+
-

Caption: Standard deviations of the score for each checklist item. Each dot represents the standard deviation of scores of 30 runs of a single repository

+

Standard deviations of the score for each checklist item. Each dot represents the standard deviation of scores from 30 runs of a single repository.

-

We found 2 diverging cases. For example, it shows high standard deviations across repositories for item 3.2 Data in the Expected Format. This might be a proof of poor prompt quality, making it ambiguous for the LLM and hence hard to produce consistent results. Prompt engineering might solve this problem.

-

On the other hand, there are outliers yielding exceptionally high standard deviations for item 5.3 Ensure Model Output Shape Aligns with Expectation. This may be because those repositories are unorthodox, but careful manual examination is required for a more definite conclusion.

-
+

We identified two diverging cases:

+
    +
  1. High Standard Deviations
  2. +
+

Items like 3.2 Data in the Expected Format showed high standard deviations across repositories. This might indicate poor prompt quality that makes it hard for the LLM to produce consistent results. Improved prompt engineering could address this issue.

+
    +
  1. Outliers with High Standard Deviations
  2. +
+

Items like 5.3 Ensure Model Output Shape Aligns with Expectation had outliers with exceptionally high standard deviations, which is possibly due to unorthodox repositories. A careful manual examination is required for a more definitive conclusion.

Comparison of gpt-3.5-turbo and gpt-4o

-

To examine if newer LLMs help in both metrics, we preliminarily compared system outputs from gpt-4o and gpt-3.5-turbo on the lightfm repository, we observed that the gpt-4o system consistently returned “Satisfied”, which deviates from the ground truth.

+

To evaluate if newer LLMs improve performance, we preliminarily compared outputs from gpt-4o and gpt-3.5-turbo on the lightfm repository. We observed that gpt-4o consistently returned “Satisfied,” which deviated from the ground truth.

Code @@ -959,31 +960,31 @@

Com titleFontSize=12 )

-
+
-
+
-

Caption: Comparison of the satisfaction using gpt-4o versus using gpt-3.5-turbo for each checklist item on lightfm

+

Comparison of satisfaction using gpt-4o versus gpt-3.5-turbo for each checklist item on lightfm

-

Further investigation into gpt-4o is required to address this issue and enhance the system performance.

+

Further investigation into gpt-4o is required to determine its effectiveness in system performance.

@@ -1039,34 +1040,32 @@

Com

Conclusion

Wrap Up

-

Our project, FixML, represents a significant step forward in the field of machine learning (ML) testing by providing curated checklists and automated tools that enhance the evaluation and creation of test suites for ML models. The development and implementation of FixML have been driven by both the need of better quality assurance in ML systems, and the current limitations of traditional testing methods on ML projects which are either too general without comprehensive clarification, or are too human-reliant.

-

FixML seamlessly takes in the user’s ML codebase, identifies and extracted its existing test suites. Together with the curated checklist on ML testing, FixML leverages Large Language Models (LLMs) to assess the completeness of the test suites and output detailed evaluation reports with completeness scores and specific reasons. This assists users in understanding the performance of their current test suites with insights. Additionally, FixML can generate test function specifications corresponding to the curated checklist, helping users utilizing their test suites.

-

In return, FixML solution combines the scalability of automated testing with the reliability of expert evaluation. By automating the evaluation process, FixML significantly reduces the time and human effort required to assess the quality of ML test suites. This popularizes thorough and efficient quality assessment on ML projects.

The development of FixML has been driven by the need for better quality assurance in ML systems and the limitations of traditional testing methods for ML projects. FixML provides curated checklists and automated tools that enhance the evaluation and creation of test suites for ML projects. This, in turn, significantly reduces the time and effort required to assess the completeness of ML test suites, and thus promotes thorough and efficient assessment of ML projects.

Limitation & Future Improvement

While FixML provides substantial benefits, there are limitations and areas we aim to address in future development:

While FixML provides substantial benefits, there are limitations and areas to be addressed in future development:

  1. Specialized Checklist

The current checklist is designed to be general and may not cover all specific requirements for different ML projects. Future development will focus on creating more specialized checklists for different domains and project types, allowing for more tailored evaluations. Since the checklist format is designed to let users easily expand, edit, and select checklist items for their specific use case, we welcome collaboration with ML researchers on the creation of specialized checklists.

The default checklist is general and may not cover all requirements for different ML projects. Future development will focus on creating specialized checklists for tailored evaluations across various domains and project types. Collaboration with ML researchers is welcomed for creating specialized checklists based on specific use cases.
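
For illustration, a user-editable checklist entry might be stored as structured data along the following lines. The schema shown is an assumption inferred from the fields that appear in the evaluation report (ID, Title, Requirement); the authoritative FixML checklist format may differ.

import yaml

# Hypothetical checklist entry; users could append, edit, or remove entries like this.
checklist_yaml = """
- ID: '3.2'
  Title: Data in the Expected Format
  Requirement: Verify that loaded data conforms to the expected schema and types.
"""
items = yaml.safe_load(checklist_yaml)
print(items[0]['Title'])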

  2. Enhanced Test Evaluator

Our current study reveals varying accuracy and consistency issues in the evaluation results produced with OpenAI GPT models. Future improvements involve prompt enhancement through prompt engineering techniques and support for multiple LLMs, for higher performance and flexibility of the FixML test evaluator. We also expect to deliver user guidelines for editing the prompts in our system, so that ML developers can customize prompts for better performance and collaborate with us to embed them into the system.

Our study reveals accuracy and consistency issues in the evaluation results produced with the OpenAI GPT-3.5-turbo model. Future improvements involve better prompt engineering and support for multiple LLMs for enhanced performance and flexibility, as sketched below. User guidelines on prompt creation will be provided to facilitate collaboration with ML developers.
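
The sketch below shows one way the evaluator could be decoupled from a specific provider: it depends only on a small interface that any LLM backend can implement. Every name here (ChatModel, OpenAIBackend, evaluate_checklist_item) is a hypothetical placeholder, not part of the current FixML code.

from typing import Protocol

class ChatModel(Protocol):
    """Minimal interface an LLM backend must satisfy."""
    def complete(self, prompt: str) -> str:
        ...

class OpenAIBackend:
    """Placeholder backend; the actual provider call would go in complete()."""
    def __init__(self, model: str = 'gpt-3.5-turbo'):
        self.model = model

    def complete(self, prompt: str) -> str:
        raise NotImplementedError("call the provider's chat API here")

def evaluate_checklist_item(item_text: str, test_code: str, llm: ChatModel) -> str:
    """Ask whichever backend is injected to judge the test code against one item."""
    prompt = f'Checklist item:\n{item_text}\n\nTest code:\n{test_code}\n\nIs it satisfied?'
    return llm.complete(prompt)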

  3. Customized Test Specification

The FixML test specification generator currently produces general test function skeletons based solely on the curated checklist, without the context of the specific ML project. Future developments will integrate the ML project codebase into the generation process to output customized test function skeletons. This further lowers the barrier for ML users to create comprehensive test suites relevant to their projects.

Future developments will integrate project-specific information to produce customized test function skeletons. This may further encourage users to create comprehensive tests.

  4. Workflow Optimization #FIXME: have to review whether to include as it seems lower priority.

The current test evaluator and test specification generator are separate entities. This could be improved by embedding a workflow engine that allows the system to automatically take actions based on the LLM response. For instance, if the LLM response suggests that test suites are partially satisfied or non-satisfied, the system could automatically run the test generator to produce test function skeletons and then reevaluate them until they are satisfied or some threshold is met. This would create a more cohesive and efficient workflow, reducing manual intervention and improving overall system performance.

The test evaluator and test specification generator are currently separate. Future improvements could embed a workflow engine that automatically takes actions based on LLM responses, as sketched below. This would create a more cohesive and efficient workflow, reduce manual intervention, and improve overall system performance.
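
As a concrete illustration of such a workflow engine, the loop below alternates evaluation and generation until every item is judged satisfied or an iteration budget runs out. The function names and the 'Evaluation' field are hypothetical placeholders, not existing FixML APIs.

def run_fix_loop(codebase, checklist, evaluate_tests, generate_test_skeletons,
                 max_iterations: int = 3) -> list:
    """Hypothetical evaluate -> generate -> re-evaluate loop with an iteration budget."""
    report = evaluate_tests(codebase, checklist)
    for _ in range(max_iterations):
        unsatisfied = [item for item in report if item['Evaluation'] != 'Satisfied']
        # Stop as soon as every checklist item is judged satisfied.
        if not unsatisfied:
            break
        # Draft skeletons for the unsatisfied items, then re-evaluate the suite.
        generate_test_skeletons(codebase, unsatisfied)
        report = evaluate_tests(codebase, checklist)
    return report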

  5. Performance Optimization #FIXME: have to review whether to include as it seems lower priority.

Performance optimization is another critical area for future development. As FixML handles large codebases and complex evaluations, optimizing the system to handle these tasks more efficiently is essential. This includes improving the speed and accuracy of the LLM responses, reducing the time taken to analyze and generate reports, and ensuring the system can scale effectively to handle more extensive and more complex projects.

By addressing these limitations and focusing on these future improvements, FixML will become an even more powerful tool for ensuring the quality and robustness of machine learning and data science projects.

As FixML handles large codebases and complex evaluations, performance optimization is essential. Future developments will focus on improving the speed and accuracy of LLM responses, reducing analysis and report generation times, and ensuring scalability for handling larger and more complex projects.

By addressing these limitations and implementing future improvements, we aim for FixML to achieve better performance, contribute to the development of better ML systems, and ultimately enhance human life.

@@ -1074,25 +1073,45 @@

References

Alexander, Rohan, Lindsay Katz, Callandra Moore, and Zane Schwartz. 2023. “Evaluating the Decency and Consistency of Data Validation Tests Generated by LLMs.” arXiv Preprint arXiv:2310.01402.

Belanger, Ashley. 2024. “Air Canada Must Honor Refund Policy Invented by Airline’s Chatbot.” Ars Technica. https://arstechnica.com/tech-policy/2024/02/air-canada-must-honor-refund-policy-invented-by-airlines-chatbot/.

Gawande, Atul. 2010. The Checklist Manifesto (HB). Penguin Books India.

Grand-View-Research. 2021. “Artificial Intelligence Market Size, Share & Trends Analysis Report by Solution, by Technology (Deep Learning, Machine Learning), by End-Use, by Region, and Segment Forecasts, 2023–2030.” Grand View Research San Francisco.

Jordan, Jeremy. 2020. “Effective Testing for Machine Learning Systems.” https://www.jeremyjordan.me/testing-ml/.

Kapoor, Sayash, and Arvind Narayanan. 2022. “Leakage and the Reproducibility Crisis in ML-Based Science.” arXiv Preprint arXiv:2207.07048.

Nunwick, Alice. 2023. “ITutorGroup Settles AI Hiring Lawsuit Alleging Age Discrimination.” Verdict. https://www.verdict.co.uk/itutorgroup-settles-ai-hiring-lawsuit-alleging-age-discrimination/.

Openja, Moses, Foutse Khomh, Armstrong Foundjem, Zhen Ming, Mouna Abidi, Ahmed E Hassan, et al. 2023. “Studying the Practices of Testing Machine Learning Software in the Wild.” arXiv Preprint arXiv:2312.12604.

Pineau, Joelle, Philippe Vincent-Lamarre, Koustuv Sinha, Vincent Larivière, Alina Beygelzimer, Florence d’Alché-Buc, Emily Fox, and Hugo Larochelle. 2021. “Improving Reproducibility in Machine Learning Research (a Report from the NeurIPS 2019 Reproducibility Program).” Journal of Machine Learning Research 22 (164): 1–20.

Regidi, Asheeta. 2019. “SEBI’s Circular: The Black Box Conundrum and Misrepresentation in AI-Based Mutual Funds.” Firstpost. https://www.firstpost.com/business/sebis-circular-the-black-box-conundrum-and-misrepresentation-in-ai-based-mutual-funds-6625161.html.

Shepardson, David. 2023. “GM’s Cruise Recalling 950 Driverless Cars After Pedestrian Dragged in Crash.” Reuters. https://www.reuters.com/business/autos-transportation/gms-cruise-recall-950-driverless-cars-after-accident-involving-pedestrian-2023-11-08/.

Team, Microsoft Industry Solutions Engineering. 2023. “Testing Data Science and MLOps Code.” Engineering Fundamentals Playbook. https://microsoft.github.io/code-with-engineering-playbook/machine-learning/ml-testing/.

Zhang, Yue, Yafu Li, Leyang Cui, Deng Cai, Lemao Liu, Tingchen Fu, Xinting Huang, et al. 2023. “Siren’s Song in the AI Ocean: A Survey on Hallucination in Large Language Models.” https://arxiv.org/abs/2309.01219.
diff --git a/report/final_report/docs/img/checklist_sample.png b/report/final_report/docs/img/checklist_sample.png
new file mode 100644
index 0000000..96bfa2d
Binary files /dev/null and b/report/final_report/docs/img/checklist_sample.png differ
diff --git a/report/final_report/docs/logo.png b/report/final_report/docs/img/logo.png
similarity index 100%
rename from report/final_report/docs/logo.png
rename to report/final_report/docs/img/logo.png
diff --git a/report/final_report/docs/img/proposed_system_overview.png b/report/final_report/docs/img/proposed_system_overview.png
new file mode 100644
index 0000000..f830376
Binary files /dev/null and b/report/final_report/docs/img/proposed_system_overview.png differ
diff --git a/report/final_report/docs/img/test_evaluation_report_sample.png b/report/final_report/docs/img/test_evaluation_report_sample.png
new file mode 100644
index 0000000..2bc1528
Binary files /dev/null and b/report/final_report/docs/img/test_evaluation_report_sample.png differ
diff --git a/report/final_report/docs/img/test_spec_sample.png b/report/final_report/docs/img/test_spec_sample.png
new file mode 100644
index 0000000..4eb2baf
Binary files /dev/null and b/report/final_report/docs/img/test_spec_sample.png differ
diff --git a/report/final_report/docs/index.html b/report/final_report/docs/index.html
index d8cf711..09f42ed 100644
--- a/report/final_report/docs/index.html
+++ b/report/final_report/docs/index.html
@@ -1,7 +1,7 @@
- Redirect to 02_plots-for-final-report.html
+ Redirect to final_report.html
diff --git a/report/final_report/docs/proposal.html b/report/final_report/docs/proposal.html
index 9079d53..9a14a8b 100644
--- a/report/final_report/docs/proposal.html
+++ b/report/final_report/docs/proposal.html
@@ -7,7 +7,7 @@
-Proposal Report - Checklists and LLM prompts for efficient and effective test creation in data analysis
+proposal