Minor bugfixes #14

Merged 5 commits on Oct 24, 2024
6 changes: 5 additions & 1 deletion README.md
@@ -34,7 +34,8 @@ llm-eval-test run --help
## Usage

```
-usage: llm-eval-test run [-h] [--catalog_path PATH] [--tasks_path PATH] [-v | -q] -H ENDPOINT -m MODEL -t TASKS -d PATH [-b INT] [-o OUTPUT]
+usage: llm-eval-test run [-h] [--catalog_path PATH] [--tasks_path PATH] [-v | -q]
+                         -H ENDPOINT -m MODEL -t TASKS -d PATH [-b INT] [-o OUTPUT]

Run tasks

@@ -69,6 +70,9 @@ mkdir $DATASETS_DIR

# Download the MMLU-Pro dataset
DATASET=TIGER-Lab/MMLU-Pro
+# Use your preferred method of downloading the dataset to $DATASETS_DIR/$DATASET
+# to use the huggingface-cli:
+pip install huggingface_hub[cli]
huggingface-cli download $DATASET --repo-type dataset --local-dir $DATASETS_DIR/$DATASET

# Run the benchmark
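For reference, a full invocation assembled from the usage string above might look like the following; the endpoint URL, model name, and task name are illustrative placeholders, not values taken from the collapsed portion of this diff:

```sh
# Hypothetical run; flags follow the usage string in the updated README
llm-eval-test run \
    -H http://localhost:8000/v1/completions \
    -m my-model \
    -t mmlu_pro \
    -d $DATASETS_DIR \
    -o results.json
```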
3 changes: 3 additions & 0 deletions llm_eval_test/__main__.py
@@ -52,6 +52,9 @@ def eval_cli():
    if args.command == 'list':
        LMEvalWrapper.list_tasks(args.tasks_path)
    elif args.command == 'run':
+        if 'chat/completions' in args.endpoint.lower():
+            logger.warning("The /v1/chat/completions API is unsupported, please use /v1/completions")
+
        # HACK: Working from a temporary directory allows us to load hf datasets
        # from disk because the dataset and evaluate libraries search the local
        # path first. Since Unitxt is loaded as a dataset, we also provide wrappers
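A sketch of what the new guard catches, assuming an illustrative local server URL: pointing `-H` at a chat endpoint now logs the warning shown in the diff above before the run continues.

```sh
# Illustrative: this endpoint triggers the new warning
# ("The /v1/chat/completions API is unsupported, please use /v1/completions")
llm-eval-test run -H http://localhost:8000/v1/chat/completions \
    -m my-model -t mmlu_pro -d $DATASETS_DIR
```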
7 changes: 3 additions & 4 deletions llm_eval_test/benchmarks/tasks/_common/README.md
@@ -1,12 +1,11 @@
# Common files for Unitxt tasks

-Symlink these files into each task that requires Unitxt.
-Ensure that the symlinks are relative to the task directory.
+Hardlink these files into each task that requires Unitxt.

```sh
mkdir new_task
cd new_task

-ln -s ../_common/task.py ./
-ln -s ../_common/unitxt ./
+ln -f ../_common/task.py ./
+ln -f ../_common/unitxt ./
```
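Dropping `-s` switches the links from symbolic to hard. A plausible motivation, given the temporary-directory HACK noted in `__main__.py`: a relative symlink breaks if the task directory is copied elsewhere (for example into the temporary working directory), while a hardlink still resolves. To confirm a link was created as a hardlink (standard coreutils, shown as an assumption about the workflow):

```sh
# Hardlinks share an inode with the original (same first column);
# a symlink would instead show an 'l' in the mode string
ls -li ../_common/task.py task.py
```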
1 change: 0 additions & 1 deletion llm_eval_test/benchmarks/tasks/mmlu/task.py

This file was deleted.

2 changes: 2 additions & 0 deletions llm_eval_test/benchmarks/tasks/mmlu/task.py
@@ -0,0 +1,2 @@
+# Wraps unitxt task in lm-eval
+from lm_eval.tasks.unitxt.task import *
1 change: 0 additions & 1 deletion llm_eval_test/benchmarks/tasks/mmlu/unitxt

This file was deleted.

1 change: 1 addition & 0 deletions llm_eval_test/benchmarks/tasks/mmlu/unitxt
@@ -0,0 +1 @@
+class: !function task.Unitxt
1 change: 0 additions & 1 deletion llm_eval_test/benchmarks/tasks/mmlu_pro/task.py

This file was deleted.

2 changes: 2 additions & 0 deletions llm_eval_test/benchmarks/tasks/mmlu_pro/task.py
@@ -0,0 +1,2 @@
+# Wraps unitxt task in lm-eval
+from lm_eval.tasks.unitxt.task import *
1 change: 0 additions & 1 deletion llm_eval_test/benchmarks/tasks/mmlu_pro/unitxt

This file was deleted.

1 change: 1 addition & 0 deletions llm_eval_test/benchmarks/tasks/mmlu_pro/unitxt
@@ -0,0 +1 @@
+class: !function task.Unitxt
15 changes: 8 additions & 7 deletions llm_eval_test/lm_eval_wrapper.py
@@ -17,7 +17,7 @@ def exec(tasks, model, endpoint, **kwargs):
            model = model,
            base_url = endpoint,
            num_concurent=1,
-            max_retries=3,
+            max_retries=kwargs["retry"],
            tokenizer_backend=None,
            tokenized_requests=False
        )
@@ -41,12 +41,13 @@ def exec(tasks, model, endpoint, **kwargs):
        )

        if results:
-            # Write results to outfile
-            logger.info(f"Writing results to {kwargs['output'].name}")
-            output = json.dumps(
-                results, indent=2, default=handle_non_serializable, ensure_ascii=False
-            )
-            kwargs['output'].write(output)
+            if kwargs.get('output'):
+                # Write results to outfile
+                logger.info(f"Writing results to {kwargs['output'].name}")
+                output = json.dumps(
+                    results, indent=2, default=handle_non_serializable, ensure_ascii=False
+                )
+                kwargs['output'].write(output)

            # Print output table
            print(make_table(results))
5 changes: 3 additions & 2 deletions llm_eval_test/parser.py
@@ -55,9 +55,10 @@ def dir_path(path: str) -> str:
help="path to dataset storage", metavar='PATH')
parser_run.add_argument('-b', '--batch_size', default=64, type=int,
help="per-request batch size", metavar='INT')
parser_run.add_argument('-r', '--retry', default=3, type=int,
help="Max number of times to retry a single request", metavar='INT')
parser_run.add_argument('-o', '--output', type=argparse.FileType('w'),
default=f"{work_dir}/output.json",
help="results output file")
help="results output file")

parser_list = subparsers.add_parser(
'list',
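The new `-r`/`--retry` option feeds `max_retries` in `lm_eval_wrapper.py` above. A hypothetical run raising the retry budget from the default of 3; endpoint, model, and task names are placeholders:

```sh
# Retry each failed request up to 5 times (default is 3)
llm-eval-test run -H http://localhost:8000/v1/completions \
    -m my-model -t mmlu_pro -d $DATASETS_DIR -r 5
```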