import scipy
import pickle
import yaml
import pandas as pd
import altair as alt
from collections import Counter


def get_report(response):
    report = []
    for result in response.call_results:
        if result.parsed_response:
            resp = result.parsed_response['results']
            for item in resp:
                item['file'] = result.files_evaluated[0]
                item['success'] = result.success
                report.append(item)
        else:
            report.append({
                'ID': '2.1',  # FIXME
                'Title': '',
                'Requirement': '',
                'Observation': '',
                'Functions': [],
                'Evaluation': '',
                'Score': 0,
                'file': result.files_evaluated[0],
                'success': result.success
            })
    return pd.DataFrame(report)
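For readers without access to the pickled batch-run outputs, the `response` object is assumed to look roughly like the following. This is a hypothetical sketch inferred from the attribute accesses in `get_report`; the real classes in the evaluation system may differ.

```python
# Hypothetical sketch of the structure get_report() expects; inferred from the
# attribute accesses above, not taken from the actual evaluation system.
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class CallResult:
    files_evaluated: list            # e.g. ['.../tests/test_foo.py']; only the first entry is used
    success: bool                    # whether the evaluation call succeeded
    parsed_response: Optional[dict]  # {'results': [{'ID': ..., 'Title': ..., 'Score': ...}, ...]} or None

@dataclass
class EvalResponse:
    call_results: list = field(default_factory=list)  # list of CallResult
```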
def extract_file_and_scores(resp_path, verbose=False):
    if verbose:
        print(resp_path)
    with open(resp_path, 'rb') as file:
        response = pickle.load(file)
    report = get_report(response)
    df = (
        report
        .pivot(index='file', columns='ID', values='Score')
        .rename_axis(None, axis=1)
    )
    df['success'] = report.groupby(['file'])['success'].all()
    df['response_path'] = resp_path
    return df.reset_index()
def generate_stat_plot(df_repo__stat, ground_truth=None, facet_col='repo', repo=None, id=None):
    """
    Generate a stat plot across all repos and all checklist items.
    Optionally overlay the ground truth and select a specific repo/checklist item.
    """
    if facet_col == 'repo':
        x_col = 'id'
        x_title = 'Checklist ID'
    elif facet_col == 'id':
        x_col = 'repo'
        x_title = 'Repository'

    # optionally restrict to a single repo and/or checklist item
    if repo:
        df_repo__stat = df_repo__stat.query(f'repo == "{repo}"')
    if id:
        df_repo__stat = df_repo__stat.query(f'id == "{id}"')

    # the base chart: clamp mean +/- std to [0, 1]
    base = alt.Chart().transform_calculate(
        min="max(0, datum.mean-datum.std)",
        max="min(1, datum.mean+datum.std)"
    )

    # the mean scores as points
    points = base.mark_point(
        filled=True,
        size=50,
        color='black'
    ).encode(
        x=alt.X(f'{x_col}:O').axis(labelAngle=0).title(x_title),
        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
    )

    # the error bars (1 standard deviation)
    errorbars = base.mark_errorbar().encode(
        x=f"{x_col}:O",
        y=alt.Y("min:Q").title('1 SD'),
        y2="max:Q"
    )

    plot = points + errorbars

    if ground_truth is not None:
        # overlay the ground-truth scores as green diamonds
        if repo:
            ground_truth = ground_truth.query(f'repo == "{repo}"')
        if id:
            ground_truth = ground_truth.query(f'id == "{id}"')

        df_repo__stat = pd.merge(df_repo__stat, ground_truth, how='left', on=['repo', 'id'])

        gt_points = alt.Chart().mark_point(
            filled=True,
            size=100,
            color='green',
            shape="diamond"
        ).encode(
            x=alt.X(f'{x_col}:O'),
            y=alt.Y('score:Q')
        )
        plot += gt_points

    plot = alt.layer(
        plot,
        data=df_repo__stat
    ).properties(
        width=400,
    ).facet(
        column=f'{facet_col}',
        columns=2
    )
    return plot
Preprocess Data
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']

# result_path = '../draft/batch_run_results/record_combine.yml'
result_path = '../data/processed/batch_run/record_combine.yml'
with open(result_path, 'r') as file:
    config = pd.DataFrame(yaml.safe_load(file))

# prepare score data by repo, run, file
tmp = [
    extract_file_and_scores(path)
    for path in config['response_path']  # FIXME: excluded deepchem
]
tmp = pd.concat(tmp, axis=0).reset_index(drop=True)

raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')

# filter out non-test files in qlib
df_repo_run_file = raw_df_repo_run_file.query(
    '(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))'
)

# prepare score data by repo, run: an item's run-level score is the max across files
df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
    id: ['max'] for id in checklist_ids
})
df_repo_run.columns = [col[0] for col in df_repo_run.columns]
df_repo_run = df_repo_run.reset_index()
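To make the `max` aggregation above concrete: a checklist item counts as satisfied for a (repo, run) pair if any of the evaluated files satisfies it. A toy illustration with made-up scores:

```python
# Toy illustration (hypothetical scores): two files per run collapse to one row per run.
toy = pd.DataFrame({
    'repo': ['lightfm'] * 4,
    'run':  [1, 1, 2, 2],
    'file': ['a_test.py', 'b_test.py', 'a_test.py', 'b_test.py'],
    '2.1':  [0.0, 1.0, 0.5, 0.5],
})
# run 1 gets 1.0 on item 2.1 because at least one of its files scored 1.0
toy.groupby(['repo', 'run']).agg({'2.1': 'max'})
```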
# prepare statistics of scores by repo
df_repo__stat = df_repo_run.groupby(['repo']).agg({
    id: ['mean', 'std', 'count'] for id in checklist_ids
})
df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
df_repo__stat = (
    df_repo__stat
    .pivot(index=['repo', 'id'], columns='stat', values='value')
    .reset_index()
    .rename_axis(None, axis=1)
)

# prepare counts of scores by repo
df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
for id in checklist_ids[1:]:
    df_repo__count = df_repo__count.merge(
        df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
        on=['repo', 'level_1'],
        how='outer'
    )
df_repo__count = df_repo__count.fillna(0)
Runs Quality
1. Some non-test files are included in the evaluation
For example, ./nanodet/nanodet/trainer/task.py:

raw_df_repo_run_file.query('repo == "nanodet"')['file'].unique()[:3]

array(['../data/raw/openja/nanodet/nanodet/trainer/task.py',
       '../data/raw/openja/nanodet/tests/test_configs/test_config.py',
       '../data/raw/openja/nanodet/tests/test_data/test_batch_process.py'],
      dtype=object)
2. Evaluation of the file magenta/magenta/models/music_vae/data_test.py always fails

df_repo_run_file[~df_repo_run_file.success]['file'].unique()

array(['../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
       '../data/raw/openja/paperless-ng/src/documents/tests/test_api.py'],
      dtype=object)
3. DeepSpeech, lightfm and magenta have the fewest (Python) test files

df_repo_run_file.query('run == 1').groupby(['repo'])['file'].count().reset_index()

|   | repo         | file |
|---|--------------|------|
| 0 | DeepSpeech   | 3    |
| 1 | apollo       | 14   |
| 2 | lightfm      | 7    |
| 3 | magenta      | 8    |
| 4 | mmf          | 70   |
| 5 | mycroft-core | 64   |
| 6 | nanodet      | 42   |
| 7 | paperless-ng | 35   |
| 8 | qlib         | 31   |
4. The test files are not always in a tests/ folder
Would it be good practice to always do that? Should there be a checklist item ensuring that all tests are placed under a tests/ folder? (See the sketch after the magenta example below.)
For example, magenta:

df_repo_run_file.query('repo == "magenta"')['file'].unique()

array(['../data/raw/openja/magenta/conftest.py',
       '../data/raw/openja/magenta/magenta/common/state_util_test.py',
       '../data/raw/openja/magenta/magenta/models/coconet/export_saved_model_test.py',
       '../data/raw/openja/magenta/magenta/models/coconet/lib_data.py',
       '../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
       '../data/raw/openja/magenta/magenta/models/onsets_frames_transcription/create_dataset_lib_test.py',
       '../data/raw/openja/magenta/magenta/models/score2perf/datagen_beam_test.py',
       '../data/raw/openja/magenta/magenta/pipelines/pipeline_test.py'],
      dtype=object)
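One way to quantify how many evaluated files sit outside a tests/ folder, sketched here with the columns already in `df_repo_run_file` (the `/tests?/` pattern is an assumption about how such folders are named):

```python
# Sketch: per repo, the share of evaluated files (run 1) located under a test(s)/ directory.
(
    df_repo_run_file
    .query('run == 1')
    .assign(in_tests_dir=lambda d: d['file'].str.contains(r'/tests?/', regex=True))
    .groupby('repo')['in_tests_dir']
    .mean()
    .reset_index(name='frac_in_tests_dir')
)
```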
Findings on the 9 repos

df_repo_run_file.repo.unique()

array(['lightfm', 'qlib', 'mmf', 'nanodet', 'magenta', 'DeepSpeech',
       'paperless-ng', 'mycroft-core', 'apollo'], dtype=object)
1. Overview of accuracy and consistency of the lightfm evaluation
Assume the ground truth for lightfm is as follows:
# Ground truth
ground_truth = pd.DataFrame([
    {'repo': 'lightfm', 'id': '2.1', 'score': 1},
    {'repo': 'lightfm', 'id': '3.2', 'score': 1},
    {'repo': 'lightfm', 'id': '3.5', 'score': 0},
    {'repo': 'lightfm', 'id': '4.2', 'score': 1},
    {'repo': 'lightfm', 'id': '5.3', 'score': 0.5},
    {'repo': 'lightfm', 'id': '6.1', 'score': 1},
    {'repo': 'lightfm', 'id': '6.2', 'score': 1},
    {'repo': 'qlib', 'id': '2.1', 'score': 0.5},
    {'repo': 'qlib', 'id': '3.2', 'score': 1},
    {'repo': 'qlib', 'id': '3.5', 'score': 0},
    {'repo': 'qlib', 'id': '4.2', 'score': 0.5},
    {'repo': 'qlib', 'id': '5.3', 'score': 1},
    {'repo': 'qlib', 'id': '6.1', 'score': 1},
    {'repo': 'qlib', 'id': '6.2', 'score': 1},
    {'repo': 'DeepSpeech', 'id': '2.1', 'score': 0},
    {'repo': 'DeepSpeech', 'id': '3.2', 'score': 0},
    {'repo': 'DeepSpeech', 'id': '3.5', 'score': 0},
    {'repo': 'DeepSpeech', 'id': '4.2', 'score': 0},
    {'repo': 'DeepSpeech', 'id': '5.3', 'score': 0},
    {'repo': 'DeepSpeech', 'id': '6.1', 'score': 0},
    {'repo': 'DeepSpeech', 'id': '6.2', 'score': 0},
])

ground_truth[ground_truth.repo == 'lightfm']
|   | repo    | id  | score |
|---|---------|-----|-------|
| 0 | lightfm | 2.1 | 1.0   |
| 1 | lightfm | 3.2 | 1.0   |
| 2 | lightfm | 3.5 | 0.0   |
| 3 | lightfm | 4.2 | 1.0   |
| 4 | lightfm | 5.3 | 0.5   |
| 5 | lightfm | 6.1 | 1.0   |
| 6 | lightfm | 6.2 | 1.0   |
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="lightfm", facet_col='repo')
The distribution of the scores for each checklist item:
df_repo__count.query('repo == "lightfm"')

|   | repo    | level_1 | 2.1  | 3.2  | 3.5  | 4.2  | 5.3  | 6.1  | 6.2  |
|---|---------|---------|------|------|------|------|------|------|------|
| 6 | lightfm | 0.0     | 0.0  | 1.0  | 19.0 | 0.0  | 18.0 | 0.0  | 0.0  |
| 7 | lightfm | 0.5     | 1.0  | 29.0 | 6.0  | 27.0 | 12.0 | 20.0 | 4.0  |
| 8 | lightfm | 1.0     | 29.0 | 0.0  | 5.0  | 3.0  | 0.0  | 10.0 | 26.0 |
Observations: The system evaluation broadly aligns with our manual evaluation:
- For items that we judged "Satisfied" (score = 1), the system mostly outputs 0.5 or 1.
- For items that we judged "Partially Satisfied" or "Not Satisfied", the system mostly outputs 0.5 or 0.
- Some checklist items display high variance, e.g. 3.5, 5.3 and 6.1.
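To make the comparison with the ground truth more explicit, one option (not part of the original analysis) is to put the most frequent system score per item for lightfm next to the manual score; ties in the mode are broken arbitrarily:

```python
# Sketch: modal system score per checklist item for lightfm vs. the ground truth.
modal = (
    df_repo_run.query('repo == "lightfm"')[checklist_ids]
    .mode()
    .iloc[0]                      # first modal value if there are ties
    .rename('modal_system_score')
    .rename_axis('id')
    .reset_index()
)
modal.merge(ground_truth.query('repo == "lightfm"')[['id', 'score']], on='id')
```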
2. Overview of qlib
Assume the ground truth for qlib is as follows (FIXME: to be confirmed):
# Ground truth
ground_truth[ground_truth.repo == 'qlib']
|    | repo | id  | score |
|----|------|-----|-------|
| 7  | qlib | 2.1 | 0.5   |
| 8  | qlib | 3.2 | 1.0   |
| 9  | qlib | 3.5 | 0.0   |
| 10 | qlib | 4.2 | 0.5   |
| 11 | qlib | 5.3 | 1.0   |
| 12 | qlib | 6.1 | 1.0   |
| 13 | qlib | 6.2 | 1.0   |
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="qlib", facet_col='repo')

df_repo__count.query('repo == "qlib"')
|    | repo | level_1 | 2.1  | 3.2  | 3.5  | 4.2  | 5.3  | 6.1  | 6.2  |
|----|------|---------|------|------|------|------|------|------|------|
| 24 | qlib | 0.0     | 0.0  | 1.0  | 29.0 | 3.0  | 14.0 | 4.0  | 1.0  |
| 25 | qlib | 0.5     | 0.0  | 12.0 | 1.0  | 27.0 | 16.0 | 24.0 | 26.0 |
| 26 | qlib | 1.0     | 30.0 | 17.0 | 0.0  | 0.0  | 0.0  | 2.0  | 3.0  |
Observations:
- There is more disagreement between the system and the manual evaluation, especially for 5.3, 6.1 and 6.2.
- The per-item consistency in this repo is not similar to that in lightfm: e.g. the variance for 3.5 is greatly reduced, while the variance for 3.2 becomes larger.
- However, qlib is not just a machine learning project; it also contains a software system. For example, it generates a lot of data randomly by itself instead of reading data to perform analysis, which seems to deviate from the objective of 2.1.
3. The consistency of each checklist item

- Why is it important? If the score of a particular item varies a lot when evaluating a repository, it might mean that its prompt (Requirement) is confusing to the LLM, or that the checklist item itself is not well defined.
df_repo__stat.pivot(index='id', columns='repo', values='std')
| id  | DeepSpeech | apollo   | lightfm  | magenta  | mmf      | mycroft-core | nanodet  | paperless-ng | qlib     |
|-----|------------|----------|----------|----------|----------|--------------|----------|--------------|----------|
| 2.1 | 0.479463   | 0.152564 | 0.091287 | 0.423451 | 0.000000 | 0.000000     | 0.000000 | 0.242117     | 0.000000 |
| 3.2 | 0.406838   | 0.215092 | 0.091287 | 0.189525 | 0.245066 | 0.278027     | 0.239732 | 0.091287     | 0.285673 |
| 3.5 | 0.000000   | 0.000000 | 0.388040 | 0.252003 | 0.126854 | 0.000000     | 0.252003 | 0.000000     | 0.091287 |
| 4.2 | 0.000000   | 0.000000 | 0.152564 | 0.091287 | 0.126854 | 0.000000     | 0.254274 | 0.000000     | 0.152564 |
| 5.3 | 0.000000   | 0.000000 | 0.249136 | 0.000000 | 0.126854 | 0.000000     | 0.000000 | 0.000000     | 0.253708 |
| 6.1 | 0.351107   | 0.172873 | 0.239732 | 0.252003 | 0.233046 | 0.000000     | 0.285673 | 0.000000     | 0.224888 |
| 6.2 | 0.000000   | 0.000000 | 0.172873 | 0.000000 | 0.201289 | 0.253708     | 0.260415 | 0.126854     | 0.182574 |
alt.Chart(df_repo__stat).mark_boxplot().encode(
    x="std:Q",
    y='id:N'
).properties(
    height=200,
    width=400
)
Observations:
- The evaluation of checklist item 2.1 Ensure Data File Loads as Expected is usually stable: when evaluating a repository, 50% of the time its standard deviation is smaller than 0.05, the smallest among the items.
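The "50% of the time" statement refers to the median across repositories of the per-repo standard deviation; it can be checked directly from `df_repo__stat`:

```python
# Median (across repos) of the per-repo standard deviation, per checklist item.
df_repo__stat.groupby('id')['std'].median().sort_values()
```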
Below is the breakdown of item scores for each repository (NOTE: only lightfm and qlib have ground truth, shown as green diamonds):

generate_stat_plot(df_repo__stat, ground_truth=ground_truth, facet_col='id')
Observations:
- (TBC) The standard deviation for items 3.5 and 5.3 varies greatly across repositories, which might imply that the test cases in some repos are confusing to the LLM while others are clear.
- (TBC) The standard deviations for items 5.3, 6.1 and 6.2 are relatively high and consistent, which might imply that there is room for refining the prompts to reduce the consistency issue.
4. The consistency of each checklist item, compared to lightfm

- Why is it important? We optimized the consistency of our system using lightfm, so we treat this repository as a benchmark. If a particular checklist item has much worse consistency in another repository, that might mean the prompt for that item is not generalizable.

Below are the standard deviations over 30 runs for each checklist item and each repository:
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
stds
(standard deviation of the score; columns are checklist item IDs)

| repo         | 2.1      | 3.2      | 3.5      | 4.2      | 5.3      | 6.1      | 6.2      |
|--------------|----------|----------|----------|----------|----------|----------|----------|
| DeepSpeech   | 0.479463 | 0.406838 | 0.000000 | 0.000000 | 0.000000 | 0.351107 | 0.000000 |
| apollo       | 0.152564 | 0.215092 | 0.000000 | 0.000000 | 0.000000 | 0.172873 | 0.000000 |
| lightfm      | 0.091287 | 0.091287 | 0.388040 | 0.152564 | 0.249136 | 0.239732 | 0.172873 |
| magenta      | 0.423451 | 0.189525 | 0.252003 | 0.091287 | 0.000000 | 0.252003 | 0.000000 |
| mmf          | 0.000000 | 0.245066 | 0.126854 | 0.126854 | 0.126854 | 0.233046 | 0.201289 |
| mycroft-core | 0.000000 | 0.278027 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.253708 |
| nanodet      | 0.000000 | 0.239732 | 0.252003 | 0.254274 | 0.000000 | 0.285673 | 0.260415 |
| paperless-ng | 0.242117 | 0.091287 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.126854 |
| qlib         | 0.000000 | 0.285673 | 0.091287 | 0.152564 | 0.253708 | 0.224888 | 0.182574 |
stds_p = stds.copy()
stds_p.columns = [col[1] for col in stds_p.columns]
stds_p = stds_p.reset_index()
stds_p = stds_p.melt(id_vars='repo', var_name='id')

stds_p.head()
|   | repo       | id  | value    |
|---|------------|-----|----------|
| 0 | DeepSpeech | 2.1 | 0.479463 |
| 1 | apollo     | 2.1 | 0.152564 |
| 2 | lightfm    | 2.1 | 0.091287 |
| 3 | magenta    | 2.1 | 0.423451 |
| 4 | mmf        | 2.1 | 0.000000 |
# stripplot = (
#     alt.Chart(stds_p)
#     .mark_point(filled=True, size=100)
#     .transform_calculate(
#         # Generate Gaussian jitter with a Box-Muller transform
#         jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
#         # jitter='random()'
#     ).encode(
#         y=alt.Y(
#             'jitter:Q',
#             title=None,
#             axis=alt.Axis(ticks=False, grid=True, labels=False),
#             scale=alt.Scale(),
#         ),
#         x=alt.X('value:Q'),
#         color=alt.Color('repo:N'),
#         row=alt.Row(
#             'id:N',
#             header=alt.Header(
#                 labelFontSize=16,
#                 labelAngle=0
#             )
#         ),
#         tooltip='repo'
#     ).configure_facet(
#         spacing=0
#     ).configure_view(
#         stroke=None
#     ).configure_axis(
#         labelFontSize=16,
#         titleFontSize=16
#     ).properties(
#         height=50,
#         width=600
#     )
# )

# stripplot
def generate_jitterbox_plot(df_stds_p):
    """
    Generate a jitterbox plot across all repos and all checklist items.
    """
    box = alt.Chart().mark_boxplot(
        color='grey',
        opacity=0.5,
        size=20,
    ).encode(
        x=alt.X('value:Q').title('SD(Score)'),
        y=alt.Y('id:N', title=None, axis=alt.Axis(labelPadding=10, grid=False))
    )

    stripplot = alt.Chart().mark_circle(size=100).encode(
        y=alt.Y(
            'id:N',
            axis=alt.Axis(ticks=False, grid=True, labels=True),
            scale=alt.Scale(),
        ),
        x='value:Q',
        yOffset="jitter:Q",
        color=alt.Color('id:N', legend=None),
        tooltip='repo'
    ).transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )

    plot = alt.layer(
        box,
        stripplot,
        data=df_stds_p
    ).configure_view(
        stroke=None
    ).configure_axis(
        labelFontSize=16,
        titleFontSize=16
    ).properties(
        height=300,
        width=600
    )

    return plot
generate_jitterbox_plot(stds_p)
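The `jitter` expression used in the strip plot is a Box-Muller transform: two independent uniform draws are converted into one standard-normal draw. A quick standalone sanity check with numpy, independent of the chart:

```python
import numpy as np

# Box-Muller: z = sqrt(-2*ln(u1)) * cos(2*pi*u2) is N(0, 1) when u1, u2 ~ Uniform(0, 1).
rng = np.random.default_rng(42)
u1 = 1.0 - rng.random(100_000)   # in (0, 1], avoids log(0)
u2 = rng.random(100_000)
z = np.sqrt(-2 * np.log(u1)) * np.cos(2 * np.pi * u2)
print(round(z.mean(), 3), round(z.std(), 3))  # ~0.0 and ~1.0
```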
# !pip install altair_catplot
# !pip install seaborn

# import altair_catplot

# altair_catplot.catplot(
#     stds_p,
#     transform='jitterbox',
#     mark='point',
#     encoding=dict(
#         x=alt.X('value:Q'),
#         y=alt.Y('id:N'),
#         color=alt.Color('repo:N')
#     )
# )
F = stds.drop(index='lightfm') / stds.loc['lightfm']

base = alt.Chart(
    F.melt(ignore_index=False).reset_index()[['repo', 'id', 'value']]
).transform_calculate(
    benchmark="1",
    threshold=f"{scipy.stats.f.ppf(0.975, 29, 29)}"
)

point = base.mark_point(
    filled=True,
    size=100,
).encode(
    x=alt.X('value:Q').title("std ratio (c.f. lightfm)"),
    y='id:N',
    color='repo',
    tooltip='repo'
).properties(
    height=200,
    width=400
)

point + base.mark_rule(color='black').encode(x="benchmark:Q") \
    + base.mark_rule(color='red').encode(x="threshold:Q")

# jitter instead of mark_point <-- prompt vs. repo problem?
# prompt: the sd of a checklist item is high for all repos
# repo: most repos have low sd, and the repo we're looking at is an outlier
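For reference, the red threshold line is based on the two-sample F test for equality of variances: with 30 runs per repository, the ratio of sample variances follows an F(29, 29) distribution under the null hypothesis of equal variance. A minimal sketch with hypothetical standard deviations; note that the F statistic is a ratio of variances (the squared SD ratio), so a cutoff applied directly to the SD ratio would correspond to the square root of the F quantile:

```python
# Variance-ratio (F) test sketch for one repo/item against the lightfm benchmark.
# sd_other and sd_lightfm are hypothetical sample SDs, each from n = 30 runs.
n = 30
sd_other, sd_lightfm = 0.25, 0.09

f_stat = (sd_other / sd_lightfm) ** 2            # ratio of sample variances
f_crit = scipy.stats.f.ppf(0.975, n - 1, n - 1)  # upper 2.5% critical value of F(29, 29)
print(f_stat > f_crit)                           # True -> significantly less consistent than lightfm
```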
Observations:
- The evaluation of checklist item 3.2 Data in the Expected Format becomes much more unstable in most of the other repositories.
- That of 2.1 is significantly unstable in the repos paperless-ng, magenta and DeepSpeech, but this may be due to the repositories themselves.
TODO: look into the scores for 3.2.
TODO: Given ground truth == 1, what is the distribution of the system score?
TODO: Given ground truth == 0, what is the distribution of the system score?
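A possible sketch for the last two TODOs, assuming the long-format frame `df_repo_run_p` built in the Contingency Table section below (so this would have to run after that cell):

```python
# Sketch: distribution of system scores conditional on the ground-truth score.
def score_distribution(df, gt_value):
    subset = df[df['ground_truth'] == gt_value]
    return subset['eval_score'].value_counts(normalize=True).sort_index()

# score_distribution(df_repo_run_p, 1.0)  # given ground truth == 1
# score_distribution(df_repo_run_p, 0.0)  # given ground truth == 0
```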
def generate_histogram_plot(df_repo_run_long, df_ground_truth=None, repo=None, id=None):
    """
    Generate histograms across all repos and all checklist items.
    Optionally incorporate the ground truth and select a specific repo/checklist item.
    """
    # data
    repo_data = df_repo_run_long.copy()
    if repo:
        repo_data = repo_data.query(f'repo == "{repo}"')
    if id:
        repo_data = repo_data.query(f'id == "{id}"')

    # base histogram chart
    base = alt.Chart().mark_bar().encode(
        x=alt.X('eval_score:Q', title='Score'),
        y=alt.Y('count()'),
        color=alt.value('grey'),
        size=alt.value(20),
    )

    if df_ground_truth is not None:
        # data
        gt_data = df_ground_truth.copy()
        if repo:
            gt_data = gt_data.query(f'repo == "{repo}"')
        if id:
            gt_data = gt_data.query(f'id == "{id}"')

        repo_data = pd.merge(repo_data, gt_data, how='left', on=['repo', 'id'])
        repo_data['is_equal_to_gt'] = repo_data['eval_score'] == repo_data['score']

        # highlight bars matching the ground truth and mark its position
        base = base.encode(
            color=alt.Color('is_equal_to_gt', scale=alt.Scale(range=['grey', 'green']), legend=None)
        )
        base += base.mark_text().encode(
            text=alt.value('Ground Truth'),
            x='score',
            size=alt.value(10),
            color=alt.value('green'),
        )

    plot = alt.layer(
        base,
        data=repo_data
    ).properties(
        width=200,
        height=200,
    ).facet(
        row='repo',
        column='id'
    )
    return plot
Contingency Table
df_repo_run_p = pd.melt(df_repo_run, id_vars=['repo', 'run'], var_name='id', value_name='eval_score')
df_repo_run_p = pd.merge(df_repo_run_p, ground_truth, how='inner', on=['repo', 'id'])
df_repo_run_p = df_repo_run_p.rename(columns={'score': 'ground_truth'})

pd.pivot_table(df_repo_run_p, values='run', index=['ground_truth'], columns=['eval_score'], aggfunc='count', fill_value=0)
| ground_truth \ eval_score | 0.0 | 0.5 | 1.0 |
|---------------------------|-----|-----|-----|
| 0.0                       | 227 | 8   | 35  |
| 0.5                       | 21  | 39  | 30  |
| 1.0                       | 21  | 159 | 90  |
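One simple summary of this table (a sketch, not part of the original analysis) is the exact-agreement rate between the system score and the ground truth over all (repo, item, run) evaluations, i.e. the diagonal mass of the table:

```python
# Sketch: fraction of evaluations where the system score equals the ground truth exactly.
(df_repo_run_p['eval_score'] == df_repo_run_p['ground_truth']).mean()
```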
df_repo_run_p
|     | repo       | run | id  | eval_score | ground_truth |
|-----|------------|-----|-----|------------|--------------|
| 0   | DeepSpeech | 1   | 2.1 | 1.0        | 0.0          |
| 1   | DeepSpeech | 2   | 2.1 | 1.0        | 0.0          |
| 2   | DeepSpeech | 3   | 2.1 | 0.0        | 0.0          |
| 3   | DeepSpeech | 4   | 2.1 | 1.0        | 0.0          |
| 4   | DeepSpeech | 5   | 2.1 | 0.0        | 0.0          |
| ... | ...        | ... | ... | ...        | ...          |
| 625 | qlib       | 26  | 6.2 | 0.5        | 1.0          |
| 626 | qlib       | 27  | 6.2 | 0.5        | 1.0          |
| 627 | qlib       | 28  | 6.2 | 0.5        | 1.0          |
| 628 | qlib       | 29  | 6.2 | 1.0        | 1.0          |
| 629 | qlib       | 30  | 6.2 | 0.5        | 1.0          |

630 rows × 5 columns
# generate_histogram_plot(df_repo_run_p, df_ground_truth=ground_truth)