| | healthbench_scripts | | |
| | sampler | | |
| | .gitignore | | 1.8 KB |
| | browsecomp_eval.py | | 6.4 KB |
| | common.py | | 10.3 KB |
| | drop_eval.py | | 10.9 KB |
| | gpqa_eval.py | | 3.1 KB |
| | healthbench_eval_test.py | | 745 B |
| | healthbench_eval.py | | 25.1 KB |
| | healthbench_meta_eval_test.py | | 6.5 KB |
| | healthbench_meta_eval.py | | 13.0 KB |
| | humaneval_eval.py | | 5.2 KB |
| | LICENSE | | 1.0 KB |
| | math_eval.py | | 2.7 KB |
| | mgsm_eval.py | | 10.0 KB |
| | mmlu_eval.py | | 4.8 KB |
| | multilingual_mmlu_benchmark_results.md | | 4.1 KB |
| | README.md | | 12.9 KB |
| | run_multilingual_mmlu.py | | 5.3 KB |
| | simple_evals.py | | 15.1 KB |
| | simpleqa_eval.py | | 11.0 KB |
| | types.py | | 1.5 KB |