This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

update lm-eval to 0.4.3
Signed-off-by: changwangss <[email protected]>
changwangss committed Jul 9, 2024
1 parent e79a71c commit b0344a1
Showing 17 changed files with 62 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/script/formatScan/pylint.sh
@@ -28,7 +28,7 @@ else
echo "Not found requirements.txt file."
fi
# install packages
pip install lm-eval==0.4.2
pip install lm-eval==0.4.3
pip install accelerate nlpaug nltk schema optimum-intel optimum peft
pip install --upgrade --force-reinstall transformers==4.36.2
pip install optimum-habana
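Since the CI script above and the requirements files below all pin the new release, a quick post-install sanity check can confirm the bump took effect. A minimal sketch (not part of this commit; the distribution name used for the metadata lookup is an assumption):

# Sanity check (illustrative only): verify the installed lm-eval version.
# Assumes the package metadata is registered under "lm_eval"; adjust the
# name if your environment records it differently.
from importlib.metadata import version

assert version("lm_eval") == "0.4.3", f"unexpected lm-eval version: {version('lm_eval')}"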
2 changes: 1 addition & 1 deletion examples/huggingface/neural_speed/requirements.txt
@@ -1,6 +1,6 @@
intel_extension_for_transformers
neural-speed
lm-eval==0.4.2
lm-eval==0.4.3
sentencepiece
gguf
--extra-index-url https://download.pytorch.org/whl/cpu
(next changed file)
@@ -1,4 +1,4 @@
transformers
accelerate
sentencepiece != 0.1.92
lm-eval==0.4.2
lm-eval==0.4.3
(next changed file)
@@ -7,5 +7,5 @@ transformers
torch==2.0.1
tqdm
neural_compressor
lm-eval==0.4.2
lm-eval==0.4.3

(next changed file)
@@ -9,5 +9,5 @@ wandb
einops
neural-compressor
pytest==8.0.0
lm-eval==0.4.2
lm-eval==0.4.3
git+https://github.com/huggingface/peft.git@6c44096c7b8d55a2ecf24be9bc68393467e1584a
(next changed file)
@@ -14,4 +14,4 @@ tiktoken #qwen
einops #qwen
auto-round
git+https://github.com/intel/neural-compressor.git
lm-eval==0.4.2
lm-eval==0.4.3
(next changed file)
@@ -13,5 +13,5 @@ einops #qwen
git+https://github.com/intel/neural-speed.git
auto-round==0.2
git+https://github.com/intel/neural-compressor.git
lm-eval==0.4.2
lm-eval==0.4.3
huggingface_hub
(next changed file)
@@ -13,5 +13,5 @@ transformers_stream_generator
tiktoken #qwen
einops #qwen
git+https://github.com/intel/neural-compressor.git
lm-eval==0.4.2
lm-eval==0.4.3
huggingface_hub
(next changed file)
@@ -11,4 +11,4 @@ neural-compressor
optimum-intel > 1.12.0
onnxruntime
intel-extension-for-pytorch
lm-eval==0.4.2
lm-eval==0.4.3
2 changes: 1 addition & 1 deletion examples/modelscope/requirements.txt
@@ -1,6 +1,6 @@
intel_extension_for_transformers
neural-speed
lm-eval==0.4.2
lm-eval==0.4.3
sentencepiece
gguf
--extra-index-url https://download.pytorch.org/whl/cpu
(next changed file)
@@ -7,7 +7,7 @@ fastapi
fschat==0.2.32
huggingface_hub
intel_extension_for_pytorch==2.3.0
lm-eval==0.4.2
lm-eval==0.4.3
neural-compressor
neural_speed==1.0a0
numpy==1.23.5
(next changed file)
@@ -4,7 +4,7 @@ evaluate
fastapi
fschat==0.2.35
huggingface_hub
lm-eval==0.4.2
lm-eval==0.4.3
neural-compressor
numpy==1.23.5
optimum
(next changed file)
@@ -6,7 +6,7 @@ fastapi
fschat==0.2.35
huggingface_hub
intel-extension-for-transformers
lm-eval==0.4.2
lm-eval==0.4.3
neural-compressor
numpy==1.23.5
optimum
(next changed file)
@@ -38,7 +38,7 @@ langchain-community==0.0.27
langchain_core==0.1.35
langid
librosa
lm-eval==0.4.2
lm-eval==0.4.3
markdown
neural-compressor
neural_speed==1.0a0
(next changed file)
@@ -43,7 +43,7 @@
from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval.evaluator import(
request_caching_arg_to_dict
)
from lm_eval.logging_utils import WandbLogger
from lm_eval.loggers import WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import make_table, simple_parse_args_string
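
Besides the version pins, the bump requires the import-path change shown above: lm-eval 0.4.3 exposes WandbLogger from lm_eval.loggers rather than lm_eval.logging_utils. A small compatibility shim (a sketch, not part of this commit) keeps a script importable against either release:

# Compatibility sketch: resolve WandbLogger from whichever module path the
# installed lm-eval release provides.
try:
    from lm_eval.loggers import WandbLogger  # lm-eval >= 0.4.3
except ImportError:
    from lm_eval.logging_utils import WandbLogger  # lm-eval 0.4.2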

(next changed file)
@@ -38,7 +38,7 @@
print_writeout,
run_task_tests,
)
from lm_eval.logging_utils import add_env_info, get_git_commit_hash
from lm_eval.loggers import add_env_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
from lm_eval import utils
@@ -509,9 +509,14 @@ def evaluate(
# aggregate results ; run bootstrap CIs
for task_output in eval_tasks:
task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
results, samples, configs, versions, num_fewshot = consolidate_results(
eval_tasks
)
(
results,
samples,
configs,
versions,
num_fewshot,
higher_is_better,
) = consolidate_results(eval_tasks)

### Calculate group metrics ###
if bool(results):
@@ -522,6 +527,23 @@
# or `task_name: []`.
# we only want to operate on groups here.
continue

# collect all higher_is_better values for metrics
# in the group's subtasks.
# TODO: clean this up ; unify with the below metric_list loop?
_higher_is_better = {}
for task in task_list:
for m, h in higher_is_better[task].items():
if m not in _higher_is_better.keys():
_higher_is_better[m] = h
if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h:
eval_logger.warning(
f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
)
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better

# collect all metric keys used by a subtask in the group.
metric_list = list(
{
key
@@ -534,38 +556,20 @@
stderr = "_stderr,".join(metric.split(","))

# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
metrics = [results[task][metric] for task in task_list if metric in results[task]] # TODO: copy?
stderrs = [results[task][stderr] for task in task_list if stderr in results[task]]
sizes = [results[task]["samples"] for task in task_list if metric in results[task]]

# compute group's pooled metric and stderr
results[group][metric] = (
lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
)
results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
)
results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula,
# comment out the above and uncomment this line:
# results[group][stderr] = \
# lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

results[group]["samples"] = sum(sizes)

@@ -578,19 +582,15 @@
if len(left_tasks_list) == 0:
break

_task_hierarchy = {
k: v for k, v in task_hierarchy.items() if k in left_tasks_list
}
_task_hierarchy = {k: v for k, v in task_hierarchy.items() if k in left_tasks_list}
_results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)

results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}

for group_name, task_list in task_hierarchy.items():
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
num_fewshot[group_name] = num_fewshot[task_list[0]] # TODO: validate this

results_dict = {
"results": dict(results_agg.items()),
@@ -599,6 +599,17 @@
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())),
"higher_is_better": dict(sorted(higher_is_better.items())),
"n-samples": {
task_output.task_name: {
"original": len(task_output.task.eval_docs),
"effective": min(
limit if limit else len(task_output.task.eval_docs),
len(task_output.task.eval_docs),
),
}
for task_output in eval_tasks
},
}
if log_samples:
results_dict["samples"] = dict(samples)
@@ -608,7 +619,6 @@
else:
return None


def request_caching_arg_to_dict(cache_requests: str) -> dict:
request_caching_args = {
"cache_requests": cache_requests in {"true", "refresh"},
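For context on the group-aggregation block above: each subtask metric is pooled with lm_eval.api.metrics.aggregate_subtask_metrics and the stderrs with pooled_sample_stderr, both of which take the subtask sample counts. A rough, self-contained illustration of one way to do size-weighted pooling (an assumption for illustration, not the library's exact formulas):

import math

def weighted_pool(metrics, stderrs, sizes):
    # Group score as the sample-size-weighted mean of subtask scores,
    # with the stderr of a weighted mean of independent estimates.
    total = sum(sizes)
    mean = sum(m * n for m, n in zip(metrics, sizes)) / total
    stderr = math.sqrt(sum((n / total) ** 2 * se ** 2 for se, n in zip(stderrs, sizes)))
    return mean, stderr

# Two subtasks with 100 and 300 samples:
print(weighted_pool([0.50, 0.70], [0.05, 0.02], [100, 300]))  # ~ (0.65, 0.0195)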
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -12,7 +12,7 @@ git+https://github.com/intel/neural-compressor.git
git+https://github.com/intel/neural-speed.git
intel-extension-for-pytorch==2.3.0
intel-tensorflow==2.14.0
lm-eval==0.4.2
lm-eval==0.4.3
mlflow
nlpaug==1.1.9
onnx
