Use canonical fully qualified class names for metrics (stanford-crfm#…
yifanmai authored and danielz02 committed Sep 7, 2023
1 parent b38e978 commit 529bf44
Showing 7 changed files with 60 additions and 53 deletions.
25 changes: 11 additions & 14 deletions docs/code.md
@@ -60,8 +60,7 @@ In order to implement new scenarios:
4. Note that you need not enumerate every possible correct answer (nor must
there even necessarily be a correct answer).
5. Make sure to document your scenario well with a clear docstring.
6. In addition, specify its `name`, `description`, and `tags` and define a class
`__init__` function even if it is simply `pass`.
6. In addition, specify its `name`, `description`, and `tags`.
7. Define a function `get_specname_spec` in `run_specs.py` to retrieve a `ScenarioSpec`
for your scenario using a class name corresponding to the Python path of
the class (e.g. `helm.benchmark.scenarios.your_scenario.YourScenario`) and any
@@ -79,17 +78,17 @@ In order to implement new scenarios:
`name` corresponding to the scenario name and any patterns to match in
curly braces, a `scenario_spec`, an `adapter_spec`, `metric_specs`,
and `groups`.
12. Add the scenario to `__init__.py`
13. Attempt to run your task with
12. Attempt to run your task with
`venv/bin/helm-run -r yourscenarioname:arg=value` where
`yourscenarioname` matches the `name` specified in YourScenario
14. Add the spec to dictionary `CANONICAL_RUN_SPEC_FUNCS` in `run_specs.py`.
15. Update `src/helm/proxy/static/contamination.yaml` with models that we trained on your scenario (i.e. contaminated).
13. Add the spec to dictionary `CANONICAL_RUN_SPEC_FUNCS` in `run_specs.py`.
14. Update `src/helm/proxy/static/contamination.yaml` with models that we trained on your scenario (i.e. contaminated).
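
For orientation, here is a hedged sketch of what such a run-spec function might look like (it is not part of this commit). It assumes it lives in `helm/benchmark/run_specs.py`, where `RunSpec`, `ScenarioSpec`, and the helper functions below are already available; `YourScenario` and its `arg` parameter are placeholders.

```python
def get_your_scenario_spec(arg: str) -> RunSpec:
    # Point the ScenarioSpec at the fully qualified Python path of the scenario class.
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.your_scenario.YourScenario",
        args={"arg": arg},
    )
    return RunSpec(
        name=f"your_scenario:arg={arg}",
        scenario_spec=scenario_spec,
        adapter_spec=get_adapter_spec1(),  # pick whichever adapter spec suits the task
        metric_specs=get_exact_match_metric_specs(),
        groups=["your_scenario"],
    )


# Then register it in the CANONICAL_RUN_SPEC_FUNCS dictionary, e.g.:
# CANONICAL_RUN_SPEC_FUNCS["your_scenario"] = get_your_scenario_spec
```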


## Adding new metrics

To add a new metric:

1. If the metric is task-specific, create a new `yourtask_metrics.py` file.
Otherwise, if the metric is generic and likely to be widely used, add it
to `basic_metrics.py`.
@@ -101,7 +100,6 @@ To add a new metric:
(e.g. multiple distance metrics).
5. For each `value` generated for a `Stat`, add it to `yourstat` using `yourstat.add(value)`.
Usually, there will only be one value for each `Stat`, but multiple can be used, e.g. to show variance.
6. Add your metric to `__init__.py`.
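
As a hedged sketch of these steps (not part of this commit): a task-specific metric under `helm/benchmark/metrics/` might look roughly like the following. The imports and the `evaluate_generation` signature are assumptions modeled on the existing metrics in that package and may differ slightly in your version.

```python
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class YourTaskMetric(Metric):
    """Illustrative metric that records one Stat per completion."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        # Illustrative scoring only: a real metric would inspect
        # request_state.result and request_state.instance.references here.
        score = 1.0
        return [Stat(MetricName("your_stat")).add(score)]


# Referenced from run_specs.py by its canonical fully qualified class name
# (the convention this commit enforces), e.g.:
# MetricSpec(class_name="helm.benchmark.metrics.yourtask_metrics.YourTaskMetric", args={})
```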

## Data augmentations

@@ -138,13 +136,12 @@ multiple perturbations and applying it onto a single instance.

### Adding a new perturbation

To add a new perturbation to the framework, create a new file at `src/helm/benchmark/augmentations` with the name
`<Name of perturbation>_perturbation.py` e.g., `typo_perturbation.py`. Inside the file, create a new class
(name it `<Name of the perturbation>Perturbation` e.g., `TypoPerturbation`)
that extends the abstract class `Perturbation` and implement the `perturb` method which
takes in text and outputs the perturbed text.
Add your new perturbation to `src/helm/benchmark/__init__.py`.
Add a test for the new perturbation in `test_perturbation.py`.
1. To add a new perturbation to the framework, create a new file at `src/helm/benchmark/augmentations` with the name
`<Name of perturbation>_perturbation.py` e.g., `typo_perturbation.py`. Inside the file, create a new class
(name it `<Name of the perturbation>Perturbation` e.g., `TypoPerturbation`)
that extends the abstract class `Perturbation` and implement the `perturb` method which
takes in text and outputs the perturbed text.
2. Add a test for the new perturbation in `test_perturbation.py`.
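
A minimal sketch of such a perturbation (not part of this commit); the exact `Perturbation` interface may require more than this, e.g. a `description` property or different arguments to `perturb`, so treat the signature below as an assumption:

```python
from random import Random

from helm.benchmark.augmentations.perturbation import Perturbation


class TypoPerturbation(Perturbation):
    """Toy example: swap a letter pair to simulate typos."""

    name: str = "typo"

    def perturb(self, text: str, rng: Random) -> str:
        # Stand-in for a real typo model: swap "th" -> "ht".
        return text.replace("th", "ht")
```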

## Supporting new Hugging Face tokenizers

10 changes: 5 additions & 5 deletions scripts/data_overlap/common/object_spec.py
@@ -1,4 +1,5 @@
from dataclasses import dataclass
import importlib
from typing import Any, Dict, Tuple


@@ -18,12 +19,11 @@ def __hash__(self):

def create_object(spec: ObjectSpec):
"""Create the actual object given the `spec`."""
# Adapted from https://stackoverflow.com/questions/547829/how-to-dynamically-load-a-python-class
components = spec.class_name.split(".")
module: Any = __import__(components[0])
for component in components[1:]:
module = getattr(module, component)
return module(**spec.args)
class_name = components[-1]
module_name = ".".join(components[:-1])
cls = getattr(importlib.import_module(module_name), class_name)
return cls(**spec.args)


def parse_object_spec(description: str) -> ObjectSpec:
52 changes: 30 additions & 22 deletions src/helm/benchmark/run_specs.py
@@ -1,3 +1,4 @@
import importlib
import itertools
from typing import Any, Callable, List, Dict, Optional, Set, TypeVar

@@ -466,7 +467,7 @@ def get_adapter_spec1() -> AdapterSpec:


def get_basic_metric_specs(names: List[str]) -> List[MetricSpec]:
return [MetricSpec(class_name="helm.benchmark.basic_metrics.BasicMetric", args={"names": names})]
return [MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.BasicMetric", args={"names": names})]


def get_exact_match_metric_specs() -> List[MetricSpec]:
@@ -482,7 +483,8 @@ def get_f1_metric_specs() -> List[MetricSpec]:
def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
return [
MetricSpec(
class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={"delimiter": delimiter}
class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
args={"delimiter": delimiter},
)
]

@@ -496,7 +498,9 @@ def get_fairness_metric_specs() -> List[MetricSpec]:


def get_bbq_metric_specs() -> List[MetricSpec]:
return [MetricSpec(class_name="helm.benchmark.bbq_metrics.BBQMetric", args={})] + get_exact_match_metric_specs()
return [
MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
] + get_exact_match_metric_specs()


def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[MetricSpec]:
@@ -506,7 +510,7 @@ def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[Met

return [
MetricSpec(
class_name="helm.benchmark.ranking_metrics.RankingMetric",
class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
args={
"method": ADAPT_RANKING_BINARY,
"measure_names": measure_names,
@@ -521,7 +525,7 @@ def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[Met

def get_toxicity_metric_specs() -> List[MetricSpec]:
return [
MetricSpec(class_name="helm.benchmark.toxicity_metrics.ToxicityMetric", args={}),
MetricSpec(class_name="helm.benchmark.metrics.toxicity_metrics.ToxicityMetric", args={}),
]


@@ -538,13 +542,13 @@ def get_bias_metric_specs() -> List[MetricSpec]:

return [
MetricSpec(
class_name="helm.benchmark.bias_metrics.BiasMetric",
class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
args={"mode": "associations", "demographic_category": dem, "target_category": tgt},
)
for dem, tgt in cross_dem_target
] + [
MetricSpec(
class_name="helm.benchmark.bias_metrics.BiasMetric",
class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
args={"mode": "representation", "demographic_category": dem},
)
for dem in demographic_categories
@@ -561,14 +565,14 @@ def get_generative_harms_metric_specs(include_basic_metrics: bool = False) -> Li

def get_summarization_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
return [
MetricSpec(class_name="helm.benchmark.summarization_metrics.SummarizationMetric", args=args)
MetricSpec(class_name="helm.benchmark.metrics.summarization_metrics.SummarizationMetric", args=args)
] + get_basic_metric_specs([])


def get_summarization_critique_metric_specs(num_respondents: int) -> List[MetricSpec]:
return [
MetricSpec(
class_name="helm.benchmark.summarization_critique_metrics.SummarizationCritiqueMetric",
class_name="helm.benchmark.metrics.summarization_critique_metrics.SummarizationCritiqueMetric",
args={"num_respondents": num_respondents},
)
]
@@ -586,7 +590,7 @@ def get_numeracy_metric_specs(run_solver: bool = False) -> List[MetricSpec]:
# The solvers are slow to run so make them skippable
if run_solver:
metric_specs += [
MetricSpec(class_name="helm.benchmark.numeracy_metrics.DistanceMetric", args={}),
MetricSpec(class_name="helm.benchmark.metrics.numeracy_metrics.DistanceMetric", args={}),
]
return metric_specs

@@ -600,15 +604,15 @@ def get_copyright_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:
args = {}
return [
MetricSpec(
class_name="helm.benchmark.copyright_metrics.BasicCopyrightMetric",
class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
args={**args, "name": "longest_common_prefix_length"},
),
MetricSpec(
class_name="helm.benchmark.copyright_metrics.BasicCopyrightMetric",
class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
args={**args, "name": "edit_distance"},
),
MetricSpec(
class_name="helm.benchmark.copyright_metrics.BasicCopyrightMetric",
class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
args={**args, "name": "edit_similarity"},
),
] + get_basic_metric_specs([])
@@ -618,10 +622,14 @@ def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricS
if args is None:
args = {}
return [
MetricSpec(class_name="helm.benchmark.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args}),
MetricSpec(class_name="helm.benchmark.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"}),
MetricSpec(
class_name="helm.benchmark.disinformation_metrics.DisinformationMetric",
class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args}
),
MetricSpec(
class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"}
),
MetricSpec(
class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric",
args={"name": "monte_carlo_entropy"},
),
] + get_basic_metric_specs([])
@@ -632,7 +640,7 @@ def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]:
return get_basic_metric_specs(["code_eval_acc", "pass"])
else: # APPS.
args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
return [MetricSpec(class_name="helm.benchmark.code_metrics.APPSMetric", args=args)]
return [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]


def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
@@ -641,7 +649,7 @@ def get_open_ended_generation_metric_specs() -> List[MetricSpec]:

def get_machine_translation_metric_specs() -> List[MetricSpec]:
return [
MetricSpec(class_name="helm.benchmark.machine_translation_metrics.MachineTranslationMetric", args={})
MetricSpec(class_name="helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric", args={})
] + get_basic_metric_specs([])


@@ -2521,10 +2529,10 @@ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
increase_max_tokens_expander = IncreaseMaxTokensRunExpander(value=AnthropicClient.ADDITIONAL_TOKENS)
# Get scenario tags
components = run_spec.scenario_spec.class_name.split(".")
module: Any = __import__(components[0])
for component in components[1:]:
module = getattr(module, component)
scenario_tags: List[str] = module.tags
class_name = components[-1]
module_name = ".".join(components[:-1])
cls = getattr(importlib.import_module(module_name), class_name)
scenario_tags: List[str] = cls.tags
# If the scenario is instruction, do not use PROMPT_ANSWER_START
if "instructions" in scenario_tags:
format_expander = FormatPromptRunExpander(
2 changes: 1 addition & 1 deletion src/helm/benchmark/scenarios/test_grammar.py
@@ -1,4 +1,4 @@
from .grammar import (
from helm.benchmark.scenarios.grammar import (
Grammar,
GrammarRule,
Expansion,
2 changes: 1 addition & 1 deletion src/helm/benchmark/scenarios/test_scenario.py
@@ -1,5 +1,5 @@
from helm.benchmark.run_specs import get_scenario_spec_tiny
from .scenario import create_scenario, Scenario, Input, PassageQuestionInput
from helm.benchmark.scenarios.scenario import create_scenario, Scenario, Input, PassageQuestionInput


class TestScenario:
10 changes: 5 additions & 5 deletions src/helm/benchmark/test_data_preprocessor.py
@@ -1,10 +1,10 @@
from typing import List

from .augmentations.data_augmenter import DataAugmenterSpec
from .augmentations.perturbation import PerturbationSpec
from .data_preprocessor import DataPreprocessor
from .run_specs import get_scenario_spec1
from .scenarios.scenario import create_scenario, Instance, Scenario, with_instance_ids
from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
from helm.benchmark.augmentations.perturbation import PerturbationSpec
from helm.benchmark.data_preprocessor import DataPreprocessor
from helm.benchmark.run_specs import get_scenario_spec1
from helm.benchmark.scenarios.scenario import create_scenario, Instance, Scenario, with_instance_ids


def test_data_preprocessor():
12 changes: 7 additions & 5 deletions src/helm/common/object_spec.py
@@ -1,3 +1,5 @@
import importlib

from dataclasses import dataclass
from typing import Any, Dict, Tuple

@@ -18,12 +20,12 @@ def __hash__(self):

def create_object(spec: ObjectSpec):
"""Create the actual object given the `spec`."""
# Adapted from https://stackoverflow.com/questions/547829/how-to-dynamically-load-a-python-class
# TODO: Refactor other places that use this pattern.
components = spec.class_name.split(".")
module: Any = __import__(components[0])
for component in components[1:]:
module = getattr(module, component)
return module(**spec.args)
class_name = components[-1]
module_name = ".".join(components[:-1])
cls = getattr(importlib.import_module(module_name), class_name)
return cls(**spec.args)


def parse_object_spec(description: str) -> ObjectSpec:
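
For context (not part of the diff), the refactored `create_object` can be exercised like this; the metric class and arguments below are just one example of a canonical fully qualified name:

```python
from helm.common.object_spec import ObjectSpec, create_object

# Imports helm.benchmark.metrics.basic_metrics via importlib and instantiates BasicMetric.
spec = ObjectSpec(
    class_name="helm.benchmark.metrics.basic_metrics.BasicMetric",
    args={"names": ["exact_match"]},
)
metric = create_object(spec)
```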
