# HumanEval: invoke the runner on the human-eval problem set, then run
# evalplus.evaluate against the humaneval dataset.
# `...` appears to be a placeholder to fill in before running.
poetry run python zero_shot_replication/runner.py \
  --model=... \
  --pset=human-eval
poetry run evalplus.evaluate \
  --dataset humaneval \
  --samples=... \
  --parallel 4 \
  --min-time-limit 0.5 \
  --gt-time-limit-factor 5
# LeetCode: invoke the runner on the leetcode problem set, then run the
# LeetCode eval script for the same model.
# `...` appears to be a placeholder to fill in before running.
poetry run python zero_shot_replication/runner.py \
  --model=... \
  --pset=leetcode
poetry run python zero_shot_replication/evals/run_leetcode_eval.py \
  --model=...
# GSM8K: invoke the runner on the gsm8k problem set, then run the GSM8K eval
# script for the same model.
# `...` appears to be a placeholder to fill in before running.
poetry run python zero_shot_replication/runner.py --model=... --pset=gsm8k
# The MATH eval script (run_math_eval.py) can service both MATH and GSM8K.
# NOTE(review): the path below now carries the zero_shot_replication/ prefix
# so it resolves from the same working directory as the other eval scripts in
# this file; confirm run_gsm8k_eval.py lives there.
poetry run python zero_shot_replication/evals/run_gsm8k_eval.py --model=...
# MATH: invoke the runner on the math problem set (OpenAI provider), then run
# the MATH eval script for the same model.
# `...` appears to be a placeholder to fill in before running.
# The runner path carries the zero_shot_replication/ prefix so that it resolves
# from the same working directory as the eval script below.
poetry run python zero_shot_replication/runner.py --provider openai --pset math --model ...
poetry run python zero_shot_replication/evals/run_math_eval.py --model=...