Skip to content

Commit

Permalink
copy
Browse files Browse the repository at this point in the history
  • Loading branch information
paul-gauthier committed Dec 25, 2024
1 parent 9767759 commit dd9b2a8
Show file tree
Hide file tree
Showing 2 changed files with 267 additions and 1 deletion.
259 changes: 259 additions & 0 deletions aider/website/_data/o1_polyglot_leaderboard.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
test_cases: 225
model: gpt-4o-mini-2024-07-18
edit_format: whole
commit_hash: a755079-dirty
pass_rate_1: 0.9
pass_rate_2: 3.6
pass_num_1: 2
pass_num_2: 8
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 36
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model gpt-4o-mini-2024-07-18
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 17.3
total_cost: 0.3236

- dirname: 2024-12-21-18-44-28--polyglot-sonnet
test_cases: 225
model: claude-3-5-sonnet-20241022
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 18.7
pass_rate_2: 45.3
pass_num_1: 42
pass_num_2: 102
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 14
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 12
total_tests: 225
command: aider --model claude-3-5-sonnet-20241022
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 30.8
total_cost: 13.4847

- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff
test_cases: 225
model: gpt-4o-2024-11-20
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 4.9
pass_rate_2: 15.1
pass_num_1: 11
pass_num_2: 34
percent_cases_well_formed: 96.0
error_outputs: 12
num_malformed_responses: 11
num_with_malformed_responses: 9
user_asks: 34
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 19
total_tests: 225
command: aider --model gpt-4o-2024-11-20
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 22.2
total_cost: 7.1835

- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
test_cases: 224
model: o1-2024-12-17 (high)
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 23.7
pass_rate_2: 61.7
pass_num_1: 53
pass_num_2: 139
percent_cases_well_formed: 91.5
error_outputs: 25
num_malformed_responses: 24
num_with_malformed_responses: 19
user_asks: 16
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model openrouter/openai/o1
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 133.2
total_cost: 0.0000

- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
test_cases: 225
model: deepseek-chat
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 5.3
pass_rate_2: 17.8
pass_num_1: 12
pass_num_2: 40
percent_cases_well_formed: 92.9
error_outputs: 42
num_malformed_responses: 37
num_with_malformed_responses: 16
user_asks: 23
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 5
test_timeouts: 5
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 184.0
total_cost: 0.5101

- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
test_cases: 225
model: claude-3-5-haiku-20241022
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 7.1
pass_rate_2: 28.0
pass_num_1: 16
pass_num_2: 63
percent_cases_well_formed: 91.1
error_outputs: 31
num_malformed_responses: 30
num_with_malformed_responses: 20
user_asks: 13
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 9
total_tests: 225
command: aider --model claude-3-5-haiku-20241022
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 31.8
total_cost: 6.0583

- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff
test_cases: 225
model: Qwen2.5-Coder-32B-Instruct
edit_format: diff
commit_hash: 6d7e8be-dirty
pass_rate_1: 4.4
pass_rate_2: 8.0
pass_num_1: 10
pass_num_2: 18
percent_cases_well_formed: 71.6
error_outputs: 158
num_malformed_responses: 148
num_with_malformed_responses: 64
user_asks: 132
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 2
total_tests: 225
command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic"
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 84.4
total_cost: 0.0000

- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole
test_cases: 225
model: o1-mini-2024-09-12
edit_format: whole
commit_hash: 37df899
pass_rate_1: 5.8
pass_rate_2: 32.9
pass_num_1: 13
pass_num_2: 74
percent_cases_well_formed: 96.9
error_outputs: 8
num_malformed_responses: 8
num_with_malformed_responses: 7
user_asks: 27
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model o1-mini
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 34.7
total_cost: 18.5770

- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2
test_cases: 225
model: gemini-exp-1206
edit_format: whole
commit_hash: b1bc2f8
pass_rate_1: 19.6
pass_rate_2: 38.2
pass_num_1: 44
pass_num_2: 86
percent_cases_well_formed: 98.2
error_outputs: 8
num_malformed_responses: 8
num_with_malformed_responses: 4
user_asks: 32
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 9
total_tests: 225
command: aider --model gemini/gemini-exp-1206
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 45.5
total_cost: 0.0000

- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole
test_cases: 225
model: gemini-2.0-flash-exp
edit_format: whole
commit_hash: b1bc2f8
pass_rate_1: 11.6
pass_rate_2: 22.2
pass_num_1: 26
pass_num_2: 50
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 9
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 8
total_tests: 225
command: aider --model gemini/gemini-2.0-flash-exp
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 12.2
total_cost: 0.0000
9 changes: 8 additions & 1 deletion aider/website/_posts/2024-12-21-polyglot.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ the performance of
today's strongest coding models and
leaves headroom for future LLMs.

{: .note :}
See the main
[aider leaderboard](https://aider.chat/docs/leaderboards/)
for benchmark results from more models.
This article only contains a snapshot
of results at the time of publication.

## The polyglot benchmark

Like aider's original code editing benchmark,
Expand Down Expand Up @@ -171,7 +178,7 @@ on GitHub.
</tr>
</thead>
<tbody>
{% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
{% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
{% for row in edit_sorted %}
<tr style="border-bottom: 1px solid #ddd;">
<td style="padding: 8px;">{{ row.model }}</td>
Expand Down

0 comments on commit dd9b2a8

Please sign in to comment.