Merge pull request #3 from sjmonson/develop
Rename to llm-eval-test
sjmonson authored Oct 15, 2024
2 parents 735947c + bfc29fe commit 2cef539
Showing 44 changed files with 11 additions and 11 deletions.
10 changes: 5 additions & 5 deletions README.md

@@ -1,4 +1,4 @@
-# perf-llm-eval
+# llm-eval-test
 
 A wrapper around [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [Unitxt](https://github.com/IBM/unitxt) designed for evaluation of a local inference endpoint.
 
@@ -25,16 +25,16 @@ python -m venv venv
 source venv/bin/activate
 
 # Install the package
-pip install git+https://github.com/sjmonson/perf-llm-eval.git
+pip install git+https://github.com/sjmonson/llm-eval-test.git
 
 # View run options
-perf-llm-eval run --help
+llm-eval-test run --help
 ```
 
 ## Usage
 
 ```
-usage: perf-llm-eval run [-h] [--catalog_path PATH] [--tasks_path PATH] [-v | -q] -H ENDPOINT -m MODEL -t TASKS -d PATH [-b INT] [-o OUTPUT]
+usage: llm-eval-test run [-h] [--catalog_path PATH] [--tasks_path PATH] [-v | -q] -H ENDPOINT -m MODEL -t TASKS -d PATH [-b INT] [-o OUTPUT]
 Run tasks
@@ -74,5 +74,5 @@ huggingface-cli download $DATASET --repo-type dataset --local-dir $DATASETS_DIR/
 # Run the benchmark
 ENDPOINT=http://127.0.0.1:8000/v1/completions # An OpenAI API-compatible completions endpoint
 MODEL_NAME=meta-llama/Llama-3.1-8B # Name of the model hosted on the inference server
-perf-llm-eval run --endpoint $ENDPOINT --model $MODEL_NAME --datasets $DATASETS_DIR --tasks mmlu_pro
+llm-eval-test run --endpoint $ENDPOINT --model $MODEL_NAME --datasets $DATASETS_DIR --tasks mmlu_pro
 ```
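The `--endpoint` flag in the README's quickstart expects an OpenAI-style completions API. As a minimal sketch, this is the shape of request body such a server accepts; the field names follow the OpenAI completions spec, and the prompt value here is a hypothetical example, not taken from the repository:

```python
import json

# Request body an OpenAI-compatible /v1/completions endpoint accepts;
# the harness sends payloads of roughly this shape to --endpoint.
payload = {
    "model": "meta-llama/Llama-3.1-8B",   # must match the server's model name
    "prompt": "The capital of France is",  # hypothetical example prompt
    "max_tokens": 8,
    "temperature": 0.0,                    # deterministic output for evaluation
}
body = json.dumps(payload)
print(body)
```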
6 changes: 3 additions & 3 deletions perf_llm_eval/__main__.py → llm_eval_test/__main__.py

@@ -4,9 +4,9 @@
 import logging
 import tempfile
 
-from perf_llm_eval.parser import setup_parser
+from llm_eval_test.parser import setup_parser
 
-logger = logging.getLogger("perf-llm-eval")
+logger = logging.getLogger("llm-eval-test")
 
 
 def eval_cli():
@@ -35,7 +35,7 @@ def eval_cli():
     os.environ["UNITXT_ARTIFACTORIES"] = args.catalog_path
 
     # Late import to avoid slow cli
-    from perf_llm_eval.lm_eval_wrapper import LMEvalWrapper
+    from llm_eval_test.lm_eval_wrapper import LMEvalWrapper
 
     if args.command == 'list':
         LMEvalWrapper.list_tasks(args.tasks_path)
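The `# Late import to avoid slow cli` comment in the `__main__.py` hunk marks a deliberate pattern: heavy dependencies load only after argument parsing, so `--help` and argument errors stay fast. A minimal sketch of the same idea, using stand-in module names rather than the project's real imports:

```python
import argparse

def build_parser():
    # Cheap to construct: only the stdlib is imported at module load time.
    parser = argparse.ArgumentParser(prog="llm-eval-test")
    parser.add_argument("command", choices=["run", "list"])
    return parser

def dispatch(argv):
    args = build_parser().parse_args(argv)
    # Late import: the heavy module is loaded only once a command actually
    # runs (json stands in here for something costly like lm_eval).
    import json
    return json.dumps({"command": args.command})

print(dispatch(["list"]))
```

Calling `build_parser().parse_args(["--help"])` never pays the cost of the deferred import, which is the point of the pattern.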
File renamed without changes.
File renamed without changes.
@@ -8,7 +8,7 @@
 from lm_eval.tasks import TaskManager  # type: ignore
 from lm_eval.utils import handle_non_serializable, make_table
 
-logger = logging.getLogger("perf-llm-eval")
+logger = logging.getLogger("llm-eval-test")
 
 class LMEvalWrapper(object):
     @staticmethod
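Both renamed hunks fetch the logger by the same name, which is why the rename must touch every module at once: `logging.getLogger` returns the identical logger object for a given name, so configuring it once in the entry point affects logging everywhere. A minimal illustration:

```python
import logging

# getLogger is a registry lookup: the same name always yields the same
# Logger object, so modules share state without passing the logger around.
a = logging.getLogger("llm-eval-test")
b = logging.getLogger("llm-eval-test")
print(a is b)
```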
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions pyproject.toml

@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "perf-llm-eval"
+name = "llm-eval-test"
 version = "0.1.0"
 description = ""
 authors = ["Samuel Monson <[email protected]>"]
@@ -12,7 +12,7 @@ unitxt = "^1.13.1"
 
 
 [tool.poetry.scripts]
-perf-llm-eval = "perf_llm_eval.__main__:eval_cli"
+llm-eval-test = "llm_eval_test.__main__:eval_cli"
 
 [build-system]
 requires = ["poetry-core"]
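The `[tool.poetry.scripts]` entry is what turns `llm-eval-test` into a shell command: its value is a `module:function` reference that the installer wraps in a launcher script. A sketch of how such a reference resolves, demonstrated against a stdlib name since `llm_eval_test` may not be installed in a given environment:

```python
import importlib

def resolve(entry_point_value):
    # "pkg.module:func" -> the callable it names; this mirrors the
    # resolution an installed console script performs at startup.
    module_path, func_name = entry_point_value.split(":")
    module = importlib.import_module(module_path)
    return getattr(module, func_name)

# Stand-in reference; the real entry point would be
# "llm_eval_test.__main__:eval_cli".
func = resolve("json:dumps")
print(func({"ok": True}))
```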
