Commit

Black
Jose J. Martinez committed Sep 13, 2023
1 parent 2bc3e21 commit f4b1f32
Showing 5 changed files with 102 additions and 356 deletions.
19 changes: 5 additions & 14 deletions grants_tagger_light/augmentation/augment.py
@@ -156,33 +156,24 @@ def augment(

 @augment_app.command()
 def augment_cli(
-    data_path: str = typer.Argument(
-        ...,
-        help="Path to mesh.jsonl"),
-    save_to_path: str = typer.Argument(
-        ...,
-        help="Path to save the new jsonl data"
-    ),
+    data_path: str = typer.Argument(..., help="Path to mesh.jsonl"),
+    save_to_path: str = typer.Argument(..., help="Path to save the new jsonl data"),
     model_key: str = typer.Option(
         "gpt-3.5-turbo",
         help="LLM to use data augmentation. By now, only `openai` is supported",
     ),
     num_proc: int = typer.Option(
-        os.cpu_count(),
-        help="Number of processes to use for data augmentation"
+        os.cpu_count(), help="Number of processes to use for data augmentation"
     ),
     batch_size: int = typer.Option(
-        64,
-        help="Preprocessing batch size (for dataset, filter, map, ...)"
+        64, help="Preprocessing batch size (for dataset, filter, map, ...)"
     ),
     min_examples: int = typer.Option(
         None,
         help="Minimum number of examples to require. "
         "Less than that will trigger data augmentation.",
     ),
-    examples: int = typer.Option(
-        25,
-        help="Examples to generate per each tag."),
+    examples: int = typer.Option(25, help="Examples to generate per each tag."),
     prompt_template: str = typer.Option(
         "grants_tagger_light/augmentation/prompt.template",
         help="File to use as a prompt. "
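For readers unfamiliar with Black, the change above is its standard collapsing behaviour: a call that fits within the configured line length (88 characters by default) is joined onto a single line, while a trailing comma before a closing bracket (the "magic trailing comma") keeps that bracket's contents exploded one per line. The snippet below is a minimal sketch of the same transformation applied programmatically; the tiny_example function is illustrative, not code from this repository.

# Minimal sketch (illustrative names, not repository code): Black collapses the
# inner typer.Argument(...) call because it fits on one line, while the trailing
# comma after the parameter keeps the function signature itself exploded.
import black

before = (
    "def tiny_example(\n"
    "    data_path: str = typer.Argument(\n"
    "        ...,\n"
    '        help="Path to mesh.jsonl"),\n'
    "):\n"
    "    pass\n"
)

print(black.format_str(before, mode=black.Mode()))
# Expected output with default settings:
# def tiny_example(
#     data_path: str = typer.Argument(..., help="Path to mesh.jsonl"),
# ):
#     pass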
21 changes: 6 additions & 15 deletions grants_tagger_light/preprocessing/preprocess_mesh.py
@@ -117,7 +117,7 @@ def preprocess_mesh(
         batch_size=batch_size,
         num_proc=num_proc,
         desc="Tokenizing",
-        fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"}
+        fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"},
     )
     logger.info("Time taken to tokenize: {}".format(time.time() - t1))

@@ -225,35 +225,26 @@ def preprocess_mesh(

 @preprocess_app.command()
 def preprocess_mesh_cli(
-    data_path: str = typer.Argument(
-        ...,
-        help="Path to mesh.jsonl"
-    ),
+    data_path: str = typer.Argument(..., help="Path to mesh.jsonl"),
     save_to_path: str = typer.Argument(
-        ...,
-        help="Path to save the serialized PyArrow dataset after preprocessing"
+        ..., help="Path to save the serialized PyArrow dataset after preprocessing"
     ),
     model_key: str = typer.Argument(
         ...,
         help="Key to use when loading tokenizer and label2id. "
         "Leave blank if training from scratch",  # noqa
     ),
     test_size: float = typer.Option(
-        None,
-        help="Fraction of data to use for testing in (0,1] or number of rows"
+        None, help="Fraction of data to use for testing in (0,1] or number of rows"
     ),
     num_proc: int = typer.Option(
-        os.cpu_count(),
-        help="Number of processes to use for preprocessing"
+        os.cpu_count(), help="Number of processes to use for preprocessing"
     ),
     max_samples: int = typer.Option(
         -1,
         help="Maximum number of samples to use for preprocessing",
     ),
-    batch_size: int = typer.Option(
-        256,
-        help="Size of the preprocessing batch"
-    ),
+    batch_size: int = typer.Option(256, help="Size of the preprocessing batch"),
     tags: str = typer.Option(
         None,
         help="Comma-separated tags you want to include in the dataset "
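The first hunk above only adds the trailing comma that Black places after the last argument of a call it keeps exploded across lines. For context on what that call does, the sketch below shows how fn_kwargs in datasets.Dataset.map forwards extra keyword arguments (here the tokenizer and the text column name) to the mapped function; the tokenize helper and the toy data are assumptions based on the diff, not the repository's exact implementation.

# Minimal sketch with toy data; names are assumptions based on the diff, not the
# repository's exact implementation.
from datasets import Dataset
from transformers import AutoTokenizer


def tokenize(batch, tokenizer, x_col):
    # With batched=True, `batch` is a dict of lists; fn_kwargs supplies
    # `tokenizer` and `x_col`, so the tokenizer receives a list of texts.
    return tokenizer(batch[x_col], truncation=True)


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dset = Dataset.from_dict(
    {"abstractText": ["First toy abstract.", "Second toy abstract."]}
)
dset = dset.map(
    tokenize,
    batched=True,
    batch_size=256,
    desc="Tokenizing",
    fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"},
)
print(dset.column_names)  # e.g. ['abstractText', 'input_ids', 'token_type_ids', 'attention_mask']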
127 changes: 0 additions & 127 deletions grants_tagger_light/retagging/cnn_gpu_config.cfg

This file was deleted.

124 changes: 0 additions & 124 deletions grants_tagger_light/retagging/config.cfg

This file was deleted.
