Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove HumanEval tasks from ICL eval #715

Merged
merged 13 commits on Nov 6, 2023
62 changes: 31 additions & 31 deletions scripts/eval/yamls/eval_gauntlet.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -133,32 +133,32 @@
- name: boolq
num_fewshot: 10
random_baseline: 0.5
- name: programming
benchmarks:
- name: human_eval
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_cpp
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_js
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_return_simple
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_return_complex
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_25
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_50
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_75
num_fewshot: 0
random_baseline: 0.0
# - name: programming
# benchmarks:
# - name: human_eval
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_cpp
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_js
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_return_simple
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_return_complex
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_25
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_50
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_75
# num_fewshot: 0
# random_baseline: 0.0
- name: world_knowledge_lm_task_subscore
benchmarks:
- name: jeopardy
Expand Down Expand Up @@ -258,8 +258,8 @@
- name: squad
num_fewshot: 10
random_baseline: 0
- name: programming_lite
benchmarks:
- name: human_eval
num_fewshot: 0
random_baseline: 0.0
# - name: programming_lite

Check failure on line 261 in scripts/eval/yamls/eval_gauntlet.yaml

View workflow job for this annotation

GitHub Actions / code-quality (3.9, [dev])

261:3 [comments-indentation] comment not indented like content
# benchmarks:
# - name: human_eval

Check failure on line 263 in scripts/eval/yamls/eval_gauntlet.yaml

View workflow job for this annotation

GitHub Actions / code-quality (3.9, [dev])

263:5 [comments-indentation] comment not indented like content
# num_fewshot: 0
# random_baseline: 0.0
128 changes: 64 additions & 64 deletions scripts/eval/yamls/tasks.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -173,67 +173,67 @@ icl_tasks:
num_fewshot: [10]
icl_task_type: multiple_choice
continuation_delimiter: "\nAnswer: " # this separates questions from answers
-
label: human_eval
dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_cpp
dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_js
dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_return_simple
dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_return_complex
dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_25
dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_50
dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_75
dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
# -
# label: human_eval
# dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_cpp
# dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_js
# dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_return_simple
# dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_return_complex
# dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_25
# dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_50
# dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_75
# dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
Loading