| arc_challenge |
1 |
none |
0 |
acc |
↑ |
0.4462 |
± |
0.0145 |
|
|
none |
0 |
acc_norm |
↑ |
0.4735 |
± |
0.0146 |
| arc_easy |
1 |
none |
0 |
acc |
↑ |
0.7618 |
± |
0.0087 |
|
|
none |
0 |
acc_norm |
↑ |
0.7458 |
± |
0.0089 |
| boolq |
2 |
none |
0 |
acc |
↑ |
0.8150 |
± |
0.0068 |
| hellaswag |
1 |
none |
0 |
acc |
↑ |
0.4687 |
± |
0.0050 |
|
|
none |
0 |
acc_norm |
↑ |
0.6195 |
± |
0.0048 |
| lambada_openai |
1 |
none |
0 |
acc |
↑ |
0.5389 |
± |
0.0069 |
|
|
none |
0 |
perplexity |
↓ |
8.8520 |
± |
0.3354 |
| mmlu |
2 |
none |
|
acc |
↑ |
0.6002 |
± |
0.0039 |
| - humanities |
2 |
none |
|
acc |
↑ |
0.5207 |
± |
0.0068 |
| - formal_logic |
1 |
none |
0 |
acc |
↑ |
0.4524 |
± |
0.0445 |
| - high_school_european_history |
1 |
none |
0 |
acc |
↑ |
0.7515 |
± |
0.0337 |
| - high_school_us_history |
1 |
none |
0 |
acc |
↑ |
0.7353 |
± |
0.0310 |
| - high_school_world_history |
1 |
none |
0 |
acc |
↑ |
0.7764 |
± |
0.0271 |
| - international_law |
1 |
none |
0 |
acc |
↑ |
0.7438 |
± |
0.0398 |
| - jurisprudence |
1 |
none |
0 |
acc |
↑ |
0.6667 |
± |
0.0456 |
| - logical_fallacies |
1 |
none |
0 |
acc |
↑ |
0.6810 |
± |
0.0366 |
| - moral_disputes |
1 |
none |
0 |
acc |
↑ |
0.6503 |
± |
0.0257 |
| - moral_scenarios |
1 |
none |
0 |
acc |
↑ |
0.2436 |
± |
0.0144 |
| - philosophy |
1 |
none |
0 |
acc |
↑ |
0.6431 |
± |
0.0272 |
| - prehistory |
1 |
none |
0 |
acc |
↑ |
0.6481 |
± |
0.0266 |
| - professional_law |
1 |
none |
0 |
acc |
↑ |
0.4511 |
± |
0.0127 |
| - world_religions |
1 |
none |
0 |
acc |
↑ |
0.6842 |
± |
0.0357 |
| - other |
2 |
none |
|
acc |
↑ |
0.6592 |
± |
0.0082 |
| - business_ethics |
1 |
none |
0 |
acc |
↑ |
0.7000 |
± |
0.0461 |
| - clinical_knowledge |
1 |
none |
0 |
acc |
↑ |
0.6679 |
± |
0.0290 |
| - college_medicine |
1 |
none |
0 |
acc |
↑ |
0.5665 |
± |
0.0378 |
| - global_facts |
1 |
none |
0 |
acc |
↑ |
0.4100 |
± |
0.0494 |
| - human_aging |
1 |
none |
0 |
acc |
↑ |
0.6951 |
± |
0.0309 |
| - management |
1 |
none |
0 |
acc |
↑ |
0.8058 |
± |
0.0392 |
| - marketing |
1 |
none |
0 |
acc |
↑ |
0.8419 |
± |
0.0239 |
| - medical_genetics |
1 |
none |
0 |
acc |
↑ |
0.7600 |
± |
0.0429 |
| - miscellaneous |
1 |
none |
0 |
acc |
↑ |
0.7535 |
± |
0.0154 |
| - nutrition |
1 |
none |
0 |
acc |
↑ |
0.6765 |
± |
0.0268 |
| - professional_accounting |
1 |
none |
0 |
acc |
↑ |
0.4326 |
± |
0.0296 |
| - professional_medicine |
1 |
none |
0 |
acc |
↑ |
0.5846 |
± |
0.0299 |
| - virology |
1 |
none |
0 |
acc |
↑ |
0.4398 |
± |
0.0386 |
| - social sciences |
2 |
none |
|
acc |
↑ |
0.6978 |
± |
0.0081 |
| - econometrics |
1 |
none |
0 |
acc |
↑ |
0.5000 |
± |
0.0470 |
| - high_school_geography |
1 |
none |
0 |
acc |
↑ |
0.7323 |
± |
0.0315 |
| - high_school_government_and_politics |
1 |
none |
0 |
acc |
↑ |
0.8083 |
± |
0.0284 |
| - high_school_macroeconomics |
1 |
none |
0 |
acc |
↑ |
0.6000 |
± |
0.0248 |
| - high_school_microeconomics |
1 |
none |
0 |
acc |
↑ |
0.7059 |
± |
0.0296 |
| - high_school_psychology |
1 |
none |
0 |
acc |
↑ |
0.7835 |
± |
0.0177 |
| - human_sexuality |
1 |
none |
0 |
acc |
↑ |
0.7328 |
± |
0.0388 |
| - professional_psychology |
1 |
none |
0 |
acc |
↑ |
0.6176 |
± |
0.0197 |
| - public_relations |
1 |
none |
0 |
acc |
↑ |
0.6000 |
± |
0.0469 |
| - security_studies |
1 |
none |
0 |
acc |
↑ |
0.7469 |
± |
0.0278 |
| - sociology |
1 |
none |
0 |
acc |
↑ |
0.7861 |
± |
0.0290 |
| - us_foreign_policy |
1 |
none |
0 |
acc |
↑ |
0.7900 |
± |
0.0409 |
| - stem |
2 |
none |
|
acc |
↑ |
0.5655 |
± |
0.0086 |
| - abstract_algebra |
1 |
none |
0 |
acc |
↑ |
0.4400 |
± |
0.0499 |
| - anatomy |
1 |
none |
0 |
acc |
↑ |
0.6074 |
± |
0.0422 |
| - astronomy |
1 |
none |
0 |
acc |
↑ |
0.7039 |
± |
0.0372 |
| - college_biology |
1 |
none |
0 |
acc |
↑ |
0.7292 |
± |
0.0372 |
| - college_chemistry |
1 |
none |
0 |
acc |
↑ |
0.4600 |
± |
0.0501 |
| - college_computer_science |
1 |
none |
0 |
acc |
↑ |
0.5000 |
± |
0.0503 |
| - college_mathematics |
1 |
none |
0 |
acc |
↑ |
0.4300 |
± |
0.0498 |
| - college_physics |
1 |
none |
0 |
acc |
↑ |
0.3529 |
± |
0.0476 |
| - computer_security |
1 |
none |
0 |
acc |
↑ |
0.7300 |
± |
0.0446 |
| - conceptual_physics |
1 |
none |
0 |
acc |
↑ |
0.6255 |
± |
0.0316 |
| - electrical_engineering |
1 |
none |
0 |
acc |
↑ |
0.5862 |
± |
0.0410 |
| - elementary_mathematics |
1 |
none |
0 |
acc |
↑ |
0.5661 |
± |
0.0255 |
| - high_school_biology |
1 |
none |
0 |
acc |
↑ |
0.7645 |
± |
0.0241 |
| - high_school_chemistry |
1 |
none |
0 |
acc |
↑ |
0.5222 |
± |
0.0351 |
| - high_school_computer_science |
1 |
none |
0 |
acc |
↑ |
0.7700 |
± |
0.0423 |
| - high_school_mathematics |
1 |
none |
0 |
acc |
↑ |
0.3926 |
± |
0.0298 |
| - high_school_physics |
1 |
none |
0 |
acc |
↑ |
0.4238 |
± |
0.0403 |
| - high_school_statistics |
1 |
none |
0 |
acc |
↑ |
0.5231 |
± |
0.0341 |
| - machine_learning |
1 |
none |
0 |
acc |
↑ |
0.4286 |
± |
0.0470 |
| openbookqa |
1 |
none |
0 |
acc |
↑ |
0.2820 |
± |
0.0201 |
|
|
none |
0 |
acc_norm |
↑ |
0.3720 |
± |
0.0216 |
| piqa |
1 |
none |
0 |
acc |
↑ |
0.7323 |
± |
0.0103 |
| truthfulqa_mc1 |
2 |
none |
0 |
acc |
↑ |
0.3317 |
± |
0.0165 |
| winogrande |
1 |
none |
0 |
acc |
↑ |
0.6630 |
± |
0.0133 |