Commit ·
d9cb81b
1
Parent(s): 36d7872
Add evaluation results for HLE, GPQA, AIME, HMMT, SWE-Bench, and Terminal-Bench (#4)
Browse files- Add evaluation results for HLE, GPQA, AIME, HMMT, SWE-Bench, and Terminal-Bench (64ed7450cf8801fae6fc24d6bea85e034393d974)
Co-authored-by: Nathan Habib <SaylorTwift@users.noreply.huggingface.co>
- .eval_results/aime_2026.yaml +9 -0
- .eval_results/gpqa_diamond.yaml +9 -0
- .eval_results/hle.yaml +9 -0
- .eval_results/hle_with_tools.yaml +10 -0
- .eval_results/hmmt_feb_2026.yaml +9 -0
- .eval_results/swe_bench_pro.yaml +9 -0
- .eval_results/swe_bench_verified.yaml +9 -0
- .eval_results/terminal_bench_2.yaml +9 -0
.eval_results/aime_2026.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset:
|
| 2 |
+
id: MathArena/aime_2026
|
| 3 |
+
task_id: MathArena/aime_2026
|
| 4 |
+
value: 96.4
|
| 5 |
+
date: '2026-04-20'
|
| 6 |
+
source:
|
| 7 |
+
url: https://huggingface.co/moonshotai/Kimi-K2.6
|
| 8 |
+
name: Model Card
|
| 9 |
+
user: SaylorTwift
|
.eval_results/gpqa_diamond.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset:
|
| 2 |
+
id: Idavidrein/gpqa
|
| 3 |
+
task_id: diamond
|
| 4 |
+
value: 90.5
|
| 5 |
+
date: '2026-04-20'
|
| 6 |
+
source:
|
| 7 |
+
url: https://huggingface.co/moonshotai/Kimi-K2.6
|
| 8 |
+
name: Model Card
|
| 9 |
+
user: SaylorTwift
|
.eval_results/hle.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset:
|
| 2 |
+
id: cais/hle
|
| 3 |
+
task_id: hle
|
| 4 |
+
value: 34.7
|
| 5 |
+
date: '2026-04-20'
|
| 6 |
+
source:
|
| 7 |
+
url: https://huggingface.co/moonshotai/Kimi-K2.6
|
| 8 |
+
name: Model Card
|
| 9 |
+
user: SaylorTwift
|
.eval_results/hle_with_tools.yaml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset:
|
| 2 |
+
id: cais/hle
|
| 3 |
+
task_id: hle
|
| 4 |
+
value: 54.0
|
| 5 |
+
date: '2026-04-20'
|
| 6 |
+
source:
|
| 7 |
+
url: https://huggingface.co/moonshotai/Kimi-K2.6
|
| 8 |
+
name: Model Card
|
| 9 |
+
user: SaylorTwift
|
| 10 |
+
notes: "With tools"
|
.eval_results/hmmt_feb_2026.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset:
|
| 2 |
+
id: MathArena/hmmt_feb_2026
|
| 3 |
+
task_id: MathArena/hmmt_feb_2026
|
| 4 |
+
value: 92.7
|
| 5 |
+
date: '2026-04-20'
|
| 6 |
+
source:
|
| 7 |
+
url: https://huggingface.co/moonshotai/Kimi-K2.6
|
| 8 |
+
name: Model Card
|
| 9 |
+
user: SaylorTwift
|
.eval_results/swe_bench_pro.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset:
|
| 2 |
+
id: ScaleAI/SWE-bench_Pro
|
| 3 |
+
task_id: SWE_Bench_Pro
|
| 4 |
+
value: 58.6
|
| 5 |
+
date: '2026-04-20'
|
| 6 |
+
source:
|
| 7 |
+
url: https://huggingface.co/moonshotai/Kimi-K2.6
|
| 8 |
+
name: Model Card
|
| 9 |
+
user: SaylorTwift
|
.eval_results/swe_bench_verified.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset:
|
| 2 |
+
id: SWE-bench/SWE-bench_Verified
|
| 3 |
+
task_id: swe_bench_%_resolved
|
| 4 |
+
value: 80.2
|
| 5 |
+
date: '2026-04-20'
|
| 6 |
+
source:
|
| 7 |
+
url: https://huggingface.co/moonshotai/Kimi-K2.6
|
| 8 |
+
name: Model Card
|
| 9 |
+
user: SaylorTwift
|
.eval_results/terminal_bench_2.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset:
|
| 2 |
+
id: harborframework/terminal-bench-2.0
|
| 3 |
+
task_id: terminalbench_2
|
| 4 |
+
value: 66.7
|
| 5 |
+
date: '2026-04-20'
|
| 6 |
+
source:
|
| 7 |
+
url: https://huggingface.co/moonshotai/Kimi-K2.6
|
| 8 |
+
name: Model Card
|
| 9 |
+
user: SaylorTwift
|