Use canonical fully qualified class names for metrics (stanford-crfm#…
yifanmai authored and danielz02 committed Sep 7, 2023
1 parent b38e978 commit 529bf44
Showing 7 changed files with 60 additions and 53 deletions.
25 changes: 11 additions & 14 deletions docs/code.md
@@ -60,8 +60,7 @@ In order to implement new scenarios:
4. Note that you need not enumerate every possible correct answer (nor must
there even necessarily be a correct answer).
5. Make sure to document your scenario well with a clear docstring.
6. In addition, specify its `name`, `description`, and `tags` and define a class
`__init__` function even if it is simply `pass`.
6. In addition, specify its `name`, `description`, and `tags`.
7. Define a function `get_specname_spec` in `run_specs.py` to retrieve a `ScenarioSpec`
for your scenario using a class name corresponding to the Python path of
the class (e.g. `helm.benchmark.scenarios.your_scenario.YourScenario`) and any
@@ -79,17 +78,17 @@ In order to implement new scenarios:
`name` corresponding to the scenario name and any patterns to match in
curly braces, a `scenario_spec`, an `adapter_spec`, `metric_specs`,
and `groups`.
12. Add the scenario to `__init__.py`
13. Attempt to run your task with
12. Attempt to run your task with
`venv/bin/helm-run -r yourscenarioname:arg=value` where
`yourscenarioname` matches the `name` specified in YourScenario
14. Add the spec to dictionary `CANONICAL_RUN_SPEC_FUNCS` in `run_specs.py`.
15. Update `src/helm/proxy/static/contamination.yaml` with models that we trained on your scenario (i.e. contaminated).
13. Add the spec to dictionary `CANONICAL_RUN_SPEC_FUNCS` in `run_specs.py`.
14. Update `src/helm/proxy/static/contamination.yaml` with models that we trained on your scenario (i.e. contaminated).
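
For orientation, here is a hedged sketch of what such a run-spec function might look like (it is not part of this commit). It assumes it lives in `helm/benchmark/run_specs.py`, where `RunSpec`, `ScenarioSpec`, and the helper functions below are already available; `YourScenario` and its `arg` parameter are placeholders.

```python
def get_your_scenario_spec(arg: str) -> RunSpec:
    # Point the ScenarioSpec at the fully qualified Python path of the scenario class.
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.your_scenario.YourScenario",
        args={"arg": arg},
    )
    return RunSpec(
        name=f"your_scenario:arg={arg}",
        scenario_spec=scenario_spec,
        adapter_spec=get_adapter_spec1(),  # pick whichever adapter spec suits the task
        metric_specs=get_exact_match_metric_specs(),
        groups=["your_scenario"],
    )


# Then register it in the CANONICAL_RUN_SPEC_FUNCS dictionary, e.g.:
# CANONICAL_RUN_SPEC_FUNCS["your_scenario"] = get_your_scenario_spec
```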


## Adding new metrics

To add a new metric:

1. If the metric is task-specific, create a new `yourtask_metrics.py` file.
Otherwise, if the metric is generic and likely to be widely used, add it
to `basic_metrics.py`.
@@ -101,7 +100,6 @@ To add a new metric:
(e.g. multiple distance metrics).
5. For each `value` generated for a `Stat`, add it to `yourstat` using `yourstat.add(value)`.
Usually, there will only be one value for each `Stat`, but multiple can be used, e.g. to show variance.
6. Add your metric to `__init__.py`.
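
As a hedged sketch of these steps (not part of this commit): a task-specific metric under `helm/benchmark/metrics/` might look roughly like the following. The imports and the `evaluate_generation` signature are assumptions modeled on the existing metrics in that package and may differ slightly in your version.

```python
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class YourTaskMetric(Metric):
    """Illustrative metric that records one Stat per completion."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        # Illustrative scoring only: a real metric would inspect
        # request_state.result and request_state.instance.references here.
        score = 1.0
        return [Stat(MetricName("your_stat")).add(score)]


# Referenced from run_specs.py by its canonical fully qualified class name
# (the convention this commit enforces), e.g.:
# MetricSpec(class_name="helm.benchmark.metrics.yourtask_metrics.YourTaskMetric", args={})
```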

## Data augmentations

@@ -138,13 +136,12 @@ multiple perturbations and applying it onto a single instance.

### Adding a new perturbation

To add a new perturbation to the framework, create a new file at `src/helm/benchmark/augmentations` with the name
`<Name of perturbation>_perturbation.py` e.g., `typo_perturbation.py`. Inside the file, create a new class
(name it `<Name of the perturbation>Perturbation` e.g., `TypoPerturbation`)
that extends the abstract class `Perturbation` and implement the `perturb` method which
takes in text and outputs the perturbed text.
Add your new perturbation to `src/helm/benchmark/__init__.py`.
Add a test for the new perturbation in `test_perturbation.py`.
1. To add a new perturbation to the framework, create a new file at `src/helm/benchmark/augmentations` with the name
`<Name of perturbation>_perturbation.py` e.g., `typo_perturbation.py`. Inside the file, create a new class
(name it `<Name of the perturbation>Perturbation` e.g., `TypoPerturbation`)
that extends the abstract class `Perturbation` and implement the `perturb` method which
takes in text and outputs the perturbed text.
2. Add a test for the new perturbation in `test_perturbation.py`.
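
A minimal sketch of such a perturbation (not part of this commit); the exact `Perturbation` interface may require more than this, e.g. a `description` property or different arguments to `perturb`, so treat the signature below as an assumption:

```python
from random import Random

from helm.benchmark.augmentations.perturbation import Perturbation


class TypoPerturbation(Perturbation):
    """Toy example: swap a letter pair to simulate typos."""

    name: str = "typo"

    def perturb(self, text: str, rng: Random) -> str:
        # Stand-in for a real typo model: swap "th" -> "ht".
        return text.replace("th", "ht")
```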

## Supporting new Hugging Face tokenizers

10 changes: 5 additions & 5 deletions scripts/data_overlap/common/object_spec.py
@@ -1,4 +1,5 @@
from dataclasses import dataclass
import importlib
from typing import Any, Dict, Tuple


@@ -18,12 +19,11 @@ def __hash__(self):

def create_object(spec: ObjectSpec):
"""Create the actual object given the `spec`."""
# Adapted from https://stackoverflow.com/questions/547829/how-to-dynamically-load-a-python-class
components = spec.class_name.split(".")
module: Any = __import__(components[0])
for component in components[1:]:
module = getattr(module, component)
return module(**spec.args)
class_name = components[-1]
module_name = ".".join(components[:-1])
cls = getattr(importlib.import_module(module_name), class_name)
return cls(**spec.args)


def parse_object_spec(description: str) -> ObjectSpec:
52 changes: 30 additions & 22 deletions src/helm/benchmark/run_specs.py
@@ -1,3 +1,4 @@
import importlib
import itertools
from typing import Any, Callable, List, Dict, Optional, Set, TypeVar

@@ -466,7 +467,7 @@ def get_adapter_spec1() -> AdapterSpec:


def get_basic_metric_specs(names: List[str]) -> List[MetricSpec]:
return [MetricSpec(class_name="helm.benchmark.basic_metrics.BasicMetric", args={"names": names})]
return [MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.BasicMetric", args={"names": names})]


def get_exact_match_metric_specs() -> List[MetricSpec]:
@@ -482,7 +483,8 @@ def get_f1_metric_specs() -> List[MetricSpec]:
def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
return [
MetricSpec(
class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={"delimiter": delimiter}
class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
args={"delimiter": delimiter},
)
]

@@ -496,7 +498,9 @@ def get_fairness_metric_specs() -> List[MetricSpec]:


def get_bbq_metric_specs() -> List[MetricSpec]:
return [MetricSpec(class_name="helm.benchmark.bbq_metrics.BBQMetric", args={})] + get_exact_match_metric_specs()
return [
MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
] + get_exact_match_metric_specs()


def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[MetricSpec]:
@@ -506,7 +510,7 @@ def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[Met

return [
MetricSpec(
class_name="helm.benchmark.ranking_metrics.RankingMetric",
class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
args={
"method": ADAPT_RANKING_BINARY,
"measure_names": measure_names,
@@ -521,7 +525,7 @@ def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[Met

def get_toxicity_metric_specs() -> List[MetricSpec]:
return [
MetricSpec(class_name="helm.benchmark.toxicity_metrics.ToxicityMetric", args={}),
MetricSpec(class_name="helm.benchmark.metrics.toxicity_metrics.ToxicityMetric", args={}),
]


@@ -538,13 +542,13 @@ def get_bias_metric_specs() -> List[MetricSpec]:

return [
MetricSpec(
class_name="helm.benchmark.bias_metrics.BiasMetric",
class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
args={"mode": "associations", "demographic_category": dem, "target_category": tgt},
)
for dem, tgt in cross_dem_target
] + [
MetricSpec(
class_name="helm.benchmark.bias_metrics.BiasMetric",
class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
args={"mode": "representation", "demographic_category": dem},
)
for dem in demographic_categories
@@ -561,14 +565,14 @@ def get_generative_harms_metric_specs(include_basic_metrics: bool = False) -> Li

def get_summarization_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
return [
MetricSpec(class_name="helm.benchmark.summarization_metrics.SummarizationMetric", args=args)
MetricSpec(class_name="helm.benchmark.metrics.summarization_metrics.SummarizationMetric", args=args)
] + get_basic_metric_specs([])


def get_summarization_critique_metric_specs(num_respondents: int) -> List[MetricSpec]:
return [
MetricSpec(
class_name="helm.benchmark.summarization_critique_metrics.SummarizationCritiqueMetric",
class_name="helm.benchmark.metrics.summarization_critique_metrics.SummarizationCritiqueMetric",
args={"num_respondents": num_respondents},
)
]
@@ -586,7 +590,7 @@ def get_numeracy_metric_specs(run_solver: bool = False) -> List[MetricSpec]:
# The solvers are slow to run so make them skippable
if run_solver:
metric_specs += [
MetricSpec(class_name="helm.benchmark.numeracy_metrics.DistanceMetric", args={}),
MetricSpec(class_name="helm.benchmark.metrics.numeracy_metrics.DistanceMetric", args={}),
]
return metric_specs

@@ -600,15 +604,15 @@ def get_copyright_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:
args = {}
return [
MetricSpec(
class_name="helm.benchmark.copyright_metrics.BasicCopyrightMetric",
class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
args={**args, "name": "longest_common_prefix_length"},
),
MetricSpec(
class_name="helm.benchmark.copyright_metrics.BasicCopyrightMetric",
class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
args={**args, "name": "edit_distance"},
),
MetricSpec(
class_name="helm.benchmark.copyright_metrics.BasicCopyrightMetric",
class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
args={**args, "name": "edit_similarity"},
),
] + get_basic_metric_specs([])
@@ -618,10 +622,14 @@ def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricS
if args is None:
args = {}
return [
MetricSpec(class_name="helm.benchmark.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args}),
MetricSpec(class_name="helm.benchmark.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"}),
MetricSpec(
class_name="helm.benchmark.disinformation_metrics.DisinformationMetric",
class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args}
),
MetricSpec(
class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"}
),
MetricSpec(
class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric",
args={"name": "monte_carlo_entropy"},
),
] + get_basic_metric_specs([])
@@ -632,7 +640,7 @@ def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]:
return get_basic_metric_specs(["code_eval_acc", "pass"])
else: # APPS.
args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
return [MetricSpec(class_name="helm.benchmark.code_metrics.APPSMetric", args=args)]
return [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]


def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
@@ -641,7 +649,7 @@ def get_open_ended_generation_metric_specs() -> List[MetricSpec]:

def get_machine_translation_metric_specs() -> List[MetricSpec]:
return [
MetricSpec(class_name="helm.benchmark.machine_translation_metrics.MachineTranslationMetric", args={})
MetricSpec(class_name="helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric", args={})
] + get_basic_metric_specs([])


@@ -2521,10 +2529,10 @@ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
increase_max_tokens_expander = IncreaseMaxTokensRunExpander(value=AnthropicClient.ADDITIONAL_TOKENS)
# Get scenario tags
components = run_spec.scenario_spec.class_name.split(".")
module: Any = __import__(components[0])
for component in components[1:]:
module = getattr(module, component)
scenario_tags: List[str] = module.tags
class_name = components[-1]
module_name = ".".join(components[:-1])
cls = getattr(importlib.import_module(module_name), class_name)
scenario_tags: List[str] = cls.tags
# If the scenario is instruction, do not use PROMPT_ANSWER_START
if "instructions" in scenario_tags:
format_expander = FormatPromptRunExpander(
2 changes: 1 addition & 1 deletion src/helm/benchmark/scenarios/test_grammar.py
@@ -1,4 +1,4 @@
from .grammar import (
from helm.benchmark.scenarios.grammar import (
Grammar,
GrammarRule,
Expansion,
2 changes: 1 addition & 1 deletion src/helm/benchmark/scenarios/test_scenario.py
@@ -1,5 +1,5 @@
from helm.benchmark.run_specs import get_scenario_spec_tiny
from .scenario import create_scenario, Scenario, Input, PassageQuestionInput
from helm.benchmark.scenarios.scenario import create_scenario, Scenario, Input, PassageQuestionInput


class TestScenario:
10 changes: 5 additions & 5 deletions src/helm/benchmark/test_data_preprocessor.py
@@ -1,10 +1,10 @@
from typing import List

from .augmentations.data_augmenter import DataAugmenterSpec
from .augmentations.perturbation import PerturbationSpec
from .data_preprocessor import DataPreprocessor
from .run_specs import get_scenario_spec1
from .scenarios.scenario import create_scenario, Instance, Scenario, with_instance_ids
from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
from helm.benchmark.augmentations.perturbation import PerturbationSpec
from helm.benchmark.data_preprocessor import DataPreprocessor
from helm.benchmark.run_specs import get_scenario_spec1
from helm.benchmark.scenarios.scenario import create_scenario, Instance, Scenario, with_instance_ids


def test_data_preprocessor():
12 changes: 7 additions & 5 deletions src/helm/common/object_spec.py
@@ -1,3 +1,5 @@
import importlib

from dataclasses import dataclass
from typing import Any, Dict, Tuple

@@ -18,12 +20,12 @@ def __hash__(self):

def create_object(spec: ObjectSpec):
"""Create the actual object given the `spec`."""
# Adapted from https://stackoverflow.com/questions/547829/how-to-dynamically-load-a-python-class
# TODO: Refactor other places that use this pattern.
components = spec.class_name.split(".")
module: Any = __import__(components[0])
for component in components[1:]:
module = getattr(module, component)
return module(**spec.args)
class_name = components[-1]
module_name = ".".join(components[:-1])
cls = getattr(importlib.import_module(module_name), class_name)
return cls(**spec.args)


def parse_object_spec(description: str) -> ObjectSpec:
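
For context (not part of the diff), the refactored `create_object` can be exercised like this; the metric class and arguments below are just one example of a canonical fully qualified name:

```python
from helm.common.object_spec import ObjectSpec, create_object

# Imports helm.benchmark.metrics.basic_metrics via importlib and instantiates BasicMetric.
spec = ObjectSpec(
    class_name="helm.benchmark.metrics.basic_metrics.BasicMetric",
    args={"names": ["exact_match"]},
)
metric = create_object(spec)
```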
