Paloma release #19

Merged 77 commits on Dec 13, 2023

Commits (77)
905de7a
Merge remote-tracking branch 'origin/small-fixes' into perplexity-sui…
IanMagnusson Oct 27, 2023
4b867a0
Merge remote-tracking branch 'origin/token-ppls' into perplexity-suit…
IanMagnusson Oct 27, 2023
9e4c9fb
Merge remote-tracking branch 'origin/other-metrics-per-subdomain' int…
IanMagnusson Oct 27, 2023
382efdc
Merge remote-tracking branch 'origin/main' into perplexity-suite-paper
IanMagnusson Oct 30, 2023
6a6af5b
pythia 7b runs
IanMagnusson Oct 30, 2023
9a143fb
domla 1b runs
IanMagnusson Oct 30, 2023
9173088
fix the hf_olmo image
IanMagnusson Oct 31, 2023
7578d2c
add aws secrets
IanMagnusson Oct 31, 2023
0ed28b4
handle local tokenizer problem in hf olmo
IanMagnusson Oct 31, 2023
e8aed05
add feature for saving to file
IanMagnusson Oct 31, 2023
080e219
handle s3 auth with env vars
IanMagnusson Oct 31, 2023
c5d4a44
still fixing s3
IanMagnusson Oct 31, 2023
81cbe9d
split up sheets into different files
IanMagnusson Oct 31, 2023
98b1203
make json lines instead of one big json
IanMagnusson Oct 31, 2023
2f6b7de
passing arg by right name
IanMagnusson Oct 31, 2023
2d1fe30
save dolma 1b to file
IanMagnusson Oct 31, 2023
b625b85
pythia 1b
IanMagnusson Oct 31, 2023
6367feb
pythia 7b
IanMagnusson Oct 31, 2023
4d99b8b
initial results exploration
IanMagnusson Nov 1, 2023
8382500
dolma 7b
IanMagnusson Nov 1, 2023
c3ceb44
Merge branch 'perplexity-suite-paper' of github.com:allenai/ai2-llm-e…
IanMagnusson Nov 1, 2023
e33d37c
rp without save file yet
IanMagnusson Nov 1, 2023
285bc8c
Try with manually uploaded fixed files
IanMagnusson Nov 1, 2023
af67310
first line chart
IanMagnusson Nov 1, 2023
2345106
clean up
IanMagnusson Nov 1, 2023
2a2a242
now with win rate
IanMagnusson Nov 2, 2023
b6009d2
New ppl and win rate viz
IanMagnusson Nov 2, 2023
b5889e8
exclude fringe datasets
IanMagnusson Nov 2, 2023
d244801
subdomain bar charts
IanMagnusson Nov 2, 2023
8ae7255
add support for olmo models in s3
IanMagnusson Nov 2, 2023
aa4e49a
just add olmo to the path instead
IanMagnusson Nov 2, 2023
a2f36dc
fix figues labels
IanMagnusson Nov 2, 2023
7f967ae
subdomain line charts
IanMagnusson Nov 3, 2023
5f192ab
added new subdomains by tasks figures
IanMagnusson Nov 3, 2023
0b7b28b
Inital results over all models
IanMagnusson Nov 4, 2023
edee19e
clean up aggregation over subdomains tables
IanMagnusson Nov 6, 2023
25ec471
Add curves by macro subdomains
IanMagnusson Nov 7, 2023
9989734
Also add median over subdomains
IanMagnusson Nov 7, 2023
193fac9
subdomains by order of performance
IanMagnusson Nov 7, 2023
d8773b8
dolma7b 1T
IanMagnusson Nov 7, 2023
9e5f420
Merge branch 'perplexity-suite-paper' of github.com:allenai/ai2-llm-e…
IanMagnusson Nov 7, 2023
0c7bf7e
RP save results
IanMagnusson Nov 7, 2023
7af8bb7
domains by rank by task
IanMagnusson Nov 9, 2023
b9d04a5
fringe curves
IanMagnusson Nov 10, 2023
fc8d17a
add pmi filtered metrics
IanMagnusson Nov 10, 2023
b9d617a
track split on token counts
IanMagnusson Nov 11, 2023
efed63a
Merge branch 'perplexity-suite-paper' of github.com:allenai/ai2-llm-e…
IanMagnusson Nov 11, 2023
9b7a85b
fix pmi ppl
IanMagnusson Nov 11, 2023
ef62450
reweighting inside ppl
IanMagnusson Nov 11, 2023
8a83142
exclude ice and stack from "all" metrics
IanMagnusson Nov 12, 2023
1afd076
remove references to "subdomain" from figures
IanMagnusson Nov 12, 2023
e5a3f27
domain improvement inequality
IanMagnusson Nov 12, 2023
044d676
most and least improved domains
IanMagnusson Nov 12, 2023
634514d
ppl reduction over model size
IanMagnusson Nov 20, 2023
c3a0594
pmi ppl on twitter aae
IanMagnusson Nov 20, 2023
1c42981
add pythia 160m and standardize model size names
IanMagnusson Nov 21, 2023
ae550a9
pile lumi
IanMagnusson Nov 25, 2023
6a8b81c
Merge branch 'perplexity-suite-paper' of github.com:allenai/ai2-llm-e…
IanMagnusson Nov 25, 2023
2fb71b7
save code!
IanMagnusson Nov 28, 2023
b597465
Merge branch 'perplexity-suite-paper' of github.com:allenai/ai2-llm-e…
IanMagnusson Nov 28, 2023
7ebe22f
fix tokens seen on baselines
IanMagnusson Nov 30, 2023
8538d66
only change unsharded dirs
IanMagnusson Dec 5, 2023
a115353
Merge branch 'perplexity-suite-paper' of github.com:allenai/ai2-llm-e…
IanMagnusson Dec 5, 2023
d61a5db
doma 1b test eos fix
IanMagnusson Dec 5, 2023
c25ce9f
more tokens on the 7b
IanMagnusson Dec 5, 2023
a6dbc07
count non-embedding params
IanMagnusson Dec 5, 2023
f9ff98b
A script to get checkpoint that were made on lumi
IanMagnusson Dec 5, 2023
92d4828
roll back paper specifics
IanMagnusson Dec 5, 2023
3d2b22a
more paper specific rollback
IanMagnusson Dec 5, 2023
365edfc
Merge branch 'main' of github.com:allenai/ai2-llm-eval into paloma-re…
IanMagnusson Dec 5, 2023
b58c78a
minimal PPL inference
IanMagnusson Dec 5, 2023
c494108
style stuff
IanMagnusson Dec 5, 2023
c0002e5
changeloooooog
IanMagnusson Dec 5, 2023
c65dec9
remove local path
IanMagnusson Dec 5, 2023
9ed51d1
centralize documentation
IanMagnusson Dec 12, 2023
a1cbad4
Update README.md
AkshitaB Dec 13, 2023
62600b4
Update README.md
AkshitaB Dec 13, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- Updated code that records the fine-grained perplexity metrics per subdomain to also include perplexity over words, characters, bytes, and also bits per byte
- Added option to track avg logit per token type
- Added script that uses the tango steps as functions, and bypasses the tango caching mechanism, for simpler execution
- Added a minimal example of how to run Paloma from the HF hub, as well as a step to output results in jsonl.gz format

### Fixed

44 changes: 44 additions & 0 deletions configs/example_paloma_config.jsonnet
@@ -0,0 +1,44 @@
/*--------------------------------------- Configurations -----------------------------------------*/

local utils = import 'utils.libsonnet';

//❗ To run this config you will need to first set up the data following the instructions in paloma/eval_data/README.md
//❗ Also note that this will run validation results. Change to paloma_hf_release_test.libsonnet to run test results.
local ppl_suite = import 'task_sets/paloma_hf_release_val.libsonnet';


//❗Set gsheet to the name of your google sheet.
// Set it to null if you do not want your results to be uploaded to a google sheet (they will still be saved as an object).
local gsheet = null;
//❗Set output_dir to a directory where you want to save outputs as jsonl.gz files.
// Set it to null if you do not want your results saved as jsonl.gz files.
local output_dir = null;

local create_models = function(model_path, revisions, gpus_needed) [
    {
        model_path: model_path,
        revision: rev,
        gpus_needed: gpus_needed,
        prediction_kwargs: {
            model_max_length: 2048, //❗Ensure that this is set to the actual max len of your model
            limit: 2, //❗ Here we only run 2 examples per task for testing purposes. Set this to null to run all examples.
        }
    }
    for rev in revisions
];

local revisions = [
    "step" + std.toString(i * 10000)
    for i in std.range(14, 14) //❗ Set this to the range of revisions you want to run.
];


local models = create_models("EleutherAI/pythia-160m-seed1", revisions, 1);
local task_sets = [
    ppl_suite.task_set
];


{
    steps: utils.create_fine_grained_pipeline(models, task_sets, gsheet, output_dir)
}
53 changes: 53 additions & 0 deletions configs/task_sets/paloma_hf_release_test.libsonnet
@@ -0,0 +1,53 @@

local task_utils = import 'task_utils.libsonnet';

local common_kwargs = {
    task_name: "ppl_custom",
    task_kwargs: {
        keep_all_instance_fields_except: ["text", "tokens"],
        detailed_output: true,
    },
    prediction_kwargs: {
        split: "test",
        model_max_length: task_utils.model_max_length,
    }
};

// TODO: refactor catwalk's Perplexity task so that it actually uses the s3 path.
// until then, let the path be present in nfs ($EVAL_DATA_PATH).
local data_dir = "paloma/";

local create_task_kwargs(task_names) = [
    {
        task_kwargs: {
            task_rename: "ppl_" + task_name,
            files: [data_dir + "/" + task_name + "/test"]
        }
    }
    for task_name in task_names
];

local task_dicts = create_task_kwargs(
    [
        "m2d2_s2orc_unsplit",
        "m2d2_wikipedia_unsplit",
        "c4_100_domains",
        "c4_en",
        "mc4",
        "4chan_meta_sep",
        "manosphere_meta_sep",
        "gab",
        "twitterAAE_HELM_fixed",
        "wikitext_103",
        "ptb",
        "redpajama",
        "falcon-refinedweb",
        "dolma-v1_5",
        "dolma_100_subreddits",
        "dolma_100_programing_languages"
    ]
);

{
    task_set: task_utils.create_task_set_from_task_dicts("eval_suite", task_dicts, common_kwargs)
}
53 changes: 53 additions & 0 deletions configs/task_sets/paloma_hf_release_val.libsonnet
@@ -0,0 +1,53 @@

local task_utils = import 'task_utils.libsonnet';

local common_kwargs = {
    task_name: "ppl_custom",
    task_kwargs: {
        keep_all_instance_fields_except: ["text", "tokens"],
        detailed_output: true,
    },
    prediction_kwargs: {
        split: "validation",
        model_max_length: task_utils.model_max_length,
    }
};

// TODO: refactor catwalk's Perplexity task so that it actually uses the s3 path.
// until then, let the path be present in nfs ($EVAL_DATA_PATH).
local data_dir = "paloma/";

local create_task_kwargs(task_names) = [
    {
        task_kwargs: {
            task_rename: "ppl_" + task_name,
            files: [data_dir + "/" + task_name + "/val"]
        }
    }
    for task_name in task_names
];

local task_dicts = create_task_kwargs(
    [
        "m2d2_s2orc_unsplit",
        "m2d2_wikipedia_unsplit",
        "c4_100_domains",
        "c4_en",
        "mc4",
        "4chan_meta_sep",
        "manosphere_meta_sep",
        "gab",
        "twitterAAE_HELM_fixed",
        "wikitext_103",
        "ptb",
        "redpajama",
        "falcon-refinedweb",
        "dolma-v1_5",
        "dolma_100_subreddits",
        "dolma_100_programing_languages"
    ]
);

{
    task_set: task_utils.create_task_set_from_task_dicts("eval_suite", task_dicts, common_kwargs)
}
19 changes: 17 additions & 2 deletions configs/utils.libsonnet
@@ -192,6 +192,18 @@ local create_processed_outputs_as_rows_multiple_metrics_steps(model_task_configs
        }
    };

local create_save_write_outputs_as_rows_multiple_metrics_as_file_steps(output_dir) =
    {
        "save-to-file": {
            type: "save-write-outputs-as-rows-multiple-metrics-as-file",
            write_outputs: {type: "ref", ref: "combine-all-outputs"},
            output_dir: output_dir,
            step_resources: {
                gpu_count: 0
            }
        }
    };

local create_pipeline(models, task_sets, gsheet) =

    // Model steps
@@ -218,7 +230,7 @@ local create_pipeline(models, task_sets, gsheet) =

    all_steps;

local create_fine_grained_pipeline(models, task_sets, gsheet) =
local create_fine_grained_pipeline(models, task_sets, gsheet, output_dir = null) =

    // Model steps
    local model_location_steps = create_model_location_steps(models);
@@ -237,13 +249,16 @@ local create_fine_grained_pipeline(models, task_sets, gsheet) =
    // Aggregate results for each task set and model combination
    local combine_all_outputs = create_processed_outputs_as_rows_multiple_metrics_steps(model_task_configs, gsheet);

    local save_to_file = create_save_write_outputs_as_rows_multiple_metrics_as_file_steps(output_dir);

    local all_steps =
        model_location_steps +
        catwalk_model_steps +
        task_steps +
        outputs_steps +
        processed_outputs_steps +
        combine_all_outputs;
        combine_all_outputs +
        save_to_file;

    all_steps;

30 changes: 30 additions & 0 deletions llm_eval/steps/run_catwalk.py
@@ -442,6 +442,7 @@ def run(
        row = {}
        task = d["task"]
        row["model"] = model
        row["split"] = pred_kwargs["split"]
        if "revision" in d["model_kwargs"]:
            row["revision"] = d["model_kwargs"]["revision"]
        row["subdomain"] = subdomain
@@ -462,6 +463,35 @@ def run(
        return per_metric_type_tsv_outputs


@Step.register("save-write-outputs-as-rows-multiple-metrics-as-file")
class SaveWriteOutputsAsRowsMultipleMetricsAsFile(Step):
    VERSION = "001"

    def run(self, write_outputs: Dict[str, List[Dict]], output_dir: str) -> None:
        import smart_open

        if output_dir is None:
            logger.info("output_dir is None, skipping save to file")
            return
        transport_params = None
        if output_dir.startswith("s3://"):
            import boto3

            session = boto3.Session(
                aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
                aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
                aws_session_token=os.environ["AWS_SESSION_TOKEN"],
            )
            client = session.client("s3")
            transport_params = dict(client=client)
        for table_name in write_outputs:
            output_file = os.path.join(output_dir, table_name + ".jsonl.gz")
            with smart_open.open(output_file, "wb", transport_params=transport_params) as f:
                for row in tqdm(write_outputs[table_name], desc=f"writing {table_name} to file"):
                    f.write(json.dumps(row).encode())
                    f.write(b"\n")


def write_to_gsheet(gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"):
    import pygsheets

44 changes: 44 additions & 0 deletions paloma/README.md
@@ -0,0 +1,44 @@
# Paloma

The Paloma benchmark makes use of this repo to run evaluation inference. This README explains everything you need to know to get results on Paloma and make a submission to our benchmark.

Links:

[Data](https://huggingface.co/datasets/allenai/paloma)

## Getting existing results from the benchmark
Paloma is first and foremost a suite of results from the research community, organized by comparability. These are formatted as *.jsonl.gz files recording perplexity per domain over our 585 domains, as well as additional metrics discussed in our paper. These files are the same type of results that are output by running the code in this repo for a given model.

We are also building out a website to allow interactive inspection of these multi-dimensional results. Until then, please contact us by emailing the first author of Paloma if you would like access to the raw benchmark results.

So far the models evaluated by the benchmark are the 6 baseline 1B parameter models that we release with Paloma, as well as `EleutherAI/pythia-160m`, `EleutherAI/pythia-1B`, and `EleutherAI/pythia-6.9b`.
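
A minimal sketch of how you might inspect one of these results files in Python. The filename and the exact per-row field names are illustrative assumptions; check the keys in the file you actually download.

```python
import gzip
import json

# Each line of a *.jsonl.gz results file is one JSON record.
# "ppl_metrics.jsonl.gz" is a hypothetical filename.
with gzip.open("ppl_metrics.jsonl.gz", "rt", encoding="utf-8") as f:
    for line in f:
        row = json.loads(line)
        print(row)  # e.g. model, domain, and perplexity metrics
```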

## Setup
Start by following the installation instructions for this repo in this [readme](../README.md).

Then follow the instructions in this [readme](eval_data/README.md) to obtain and set up the evaluation data.

## Running evaluation
After following the setup instructions above, you can make an evaluation configuration based on our template [here](../configs/example_paloma_config.jsonnet). This is designed to work with any model hosted on the HuggingFace hub: just specify the name of the model on the hub and any revisions (i.e., checkpoints) that you want results over. Read the comments in the configuration marked with the ❗ symbol for details you may need to fill in. Finally, make sure to set `output_dir` to the directory where you want the job to write your results.

Now you can run your evaluation job locally with the following command (from the root of this repo):
```
tango --settings tango.yml run configs/example_paloma_config.jsonnet --workspace my-eval-workspace
```

## Pretraining your model
Note that if you want to make a submission to our benchmark, you must choose whether to opt in to several experimental controls that allow your submission to be marked for the greatest level of comparability. In this section we detail how you can accomplish these experimental controls.

### Decontaminating your pretraining data
Our decontamination approach is implemented in the Dolma tooling repo. It allows you to remove any document from your pretraining data that is contaminated with respect to Paloma.

To do this, please follow the instructions [here](https://github.com/allenai/dolma/blob/decon-instructions/docs/paloma_decontamination.md) to decontaminate your own pretraining data.

### Fixing the training data order
Our approach for fixing the training data order requires the use of the same training code that we employ to train our 1B parameter baselines. This training code cannot yet be released, as it is being developed for a separate, ongoing project. When that code is released, we will update our instructions here to enable use of this experimental control. If you wish to use this control before then, please feel free to reach out to the first author of Paloma.

### Fixing the vocabulary
We ask that submissions that do not investigate changes in vocabulary opt in to our standardized vocabulary to enable the greatest level of comparability. That vocabulary is available from the tokenizer hosted on the HuggingFace hub as `allenai/gpt-neox-olmo-dolma-v1_5`.
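
A minimal sketch of loading that tokenizer, assuming the HuggingFace `transformers` library is installed:

```python
from transformers import AutoTokenizer

# Fetch the tokenizer that defines the standardized vocabulary.
tokenizer = AutoTokenizer.from_pretrained("allenai/gpt-neox-olmo-dolma-v1_5")
print(tokenizer.vocab_size)
```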

## Making a submission
At present we are building out an automatic submission process that will soon be available. Until then, please reach out to us by emailing the first author of Paloma if you would like to submit results to the benchmark.
15 changes: 15 additions & 0 deletions paloma/eval_data/README.md
@@ -0,0 +1,15 @@
# Local evaluation data

This directory is used as a temporary workaround until we implement perplexity inference with HF hub datasets.

To use Paloma with this pipeline, you will need to first download the data from the HF hub (install git lfs first if necessary):
```
huggingface-cli login
git lfs install
git clone https://huggingface.co/datasets/allenai/paloma
```

Then, when you run the pipeline, you will first need to export the path to this data:
```
export EVAL_DATA_PATH=$(pwd)
```
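
As an optional sanity check (a sketch, not part of the pipeline), you can verify that the cloned data sits where the task configs expect it, i.e. under `$EVAL_DATA_PATH/paloma/<domain>/val`:

```python
import os

# The task configs read files at paloma/<domain>/val (and /test) relative to EVAL_DATA_PATH.
root = os.path.join(os.environ["EVAL_DATA_PATH"], "paloma")
for domain in sorted(os.listdir(root)):
    if domain.startswith("."):
        continue  # skip .git and other hidden entries
    has_val = os.path.exists(os.path.join(root, domain, "val"))
    print(domain, "ok" if has_val else "missing val split")
```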