Commit

Black
Jose J. Martinez committed Sep 13, 2023
1 parent 2bc3e21 commit f4b1f32
Showing 5 changed files with 102 additions and 356 deletions.
19 changes: 5 additions & 14 deletions grants_tagger_light/augmentation/augment.py
@@ -156,33 +156,24 @@ def augment(

 @augment_app.command()
 def augment_cli(
-    data_path: str = typer.Argument(
-        ...,
-        help="Path to mesh.jsonl"),
-    save_to_path: str = typer.Argument(
-        ...,
-        help="Path to save the new jsonl data"
-    ),
+    data_path: str = typer.Argument(..., help="Path to mesh.jsonl"),
+    save_to_path: str = typer.Argument(..., help="Path to save the new jsonl data"),
     model_key: str = typer.Option(
         "gpt-3.5-turbo",
         help="LLM to use data augmentation. By now, only `openai` is supported",
     ),
     num_proc: int = typer.Option(
-        os.cpu_count(),
-        help="Number of processes to use for data augmentation"
+        os.cpu_count(), help="Number of processes to use for data augmentation"
     ),
     batch_size: int = typer.Option(
-        64,
-        help="Preprocessing batch size (for dataset, filter, map, ...)"
+        64, help="Preprocessing batch size (for dataset, filter, map, ...)"
     ),
     min_examples: int = typer.Option(
         None,
         help="Minimum number of examples to require. "
         "Less than that will trigger data augmentation.",
     ),
-    examples: int = typer.Option(
-        25,
-        help="Examples to generate per each tag."),
+    examples: int = typer.Option(25, help="Examples to generate per each tag."),
     prompt_template: str = typer.Option(
         "grants_tagger_light/augmentation/prompt.template",
         help="File to use as a prompt. "
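For readers unfamiliar with Black, the change above is its standard collapsing behaviour: a call that fits within the configured line length (88 characters by default) is joined onto a single line, while a trailing comma before a closing bracket (the "magic trailing comma") keeps that bracket's contents exploded one per line. The snippet below is a minimal sketch of the same transformation applied programmatically; the tiny_example function is illustrative, not code from this repository.

# Minimal sketch (illustrative names, not repository code): Black collapses the
# inner typer.Argument(...) call because it fits on one line, while the trailing
# comma after the parameter keeps the function signature itself exploded.
import black

before = (
    "def tiny_example(\n"
    "    data_path: str = typer.Argument(\n"
    "        ...,\n"
    '        help="Path to mesh.jsonl"),\n'
    "):\n"
    "    pass\n"
)

print(black.format_str(before, mode=black.Mode()))
# Expected output with default settings:
# def tiny_example(
#     data_path: str = typer.Argument(..., help="Path to mesh.jsonl"),
# ):
#     pass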
21 changes: 6 additions & 15 deletions grants_tagger_light/preprocessing/preprocess_mesh.py
@@ -117,7 +117,7 @@ def preprocess_mesh(
         batch_size=batch_size,
         num_proc=num_proc,
         desc="Tokenizing",
-        fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"}
+        fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"},
     )
     logger.info("Time taken to tokenize: {}".format(time.time() - t1))

@@ -225,35 +225,26 @@ def preprocess_mesh(

 @preprocess_app.command()
 def preprocess_mesh_cli(
-    data_path: str = typer.Argument(
-        ...,
-        help="Path to mesh.jsonl"
-    ),
+    data_path: str = typer.Argument(..., help="Path to mesh.jsonl"),
     save_to_path: str = typer.Argument(
-        ...,
-        help="Path to save the serialized PyArrow dataset after preprocessing"
+        ..., help="Path to save the serialized PyArrow dataset after preprocessing"
     ),
     model_key: str = typer.Argument(
         ...,
         help="Key to use when loading tokenizer and label2id. "
         "Leave blank if training from scratch",  # noqa
     ),
     test_size: float = typer.Option(
-        None,
-        help="Fraction of data to use for testing in (0,1] or number of rows"
+        None, help="Fraction of data to use for testing in (0,1] or number of rows"
     ),
     num_proc: int = typer.Option(
-        os.cpu_count(),
-        help="Number of processes to use for preprocessing"
+        os.cpu_count(), help="Number of processes to use for preprocessing"
     ),
     max_samples: int = typer.Option(
         -1,
         help="Maximum number of samples to use for preprocessing",
     ),
-    batch_size: int = typer.Option(
-        256,
-        help="Size of the preprocessing batch"
-    ),
+    batch_size: int = typer.Option(256, help="Size of the preprocessing batch"),
     tags: str = typer.Option(
         None,
         help="Comma-separated tags you want to include in the dataset "
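The first hunk above only adds the trailing comma that Black places after the last argument of a call it keeps exploded across lines. For context on what that call does, the sketch below shows how fn_kwargs in datasets.Dataset.map forwards extra keyword arguments (here the tokenizer and the text column name) to the mapped function; the tokenize helper and the toy data are assumptions based on the diff, not the repository's exact implementation.

# Minimal sketch with toy data; names are assumptions based on the diff, not the
# repository's exact implementation.
from datasets import Dataset
from transformers import AutoTokenizer


def tokenize(batch, tokenizer, x_col):
    # With batched=True, `batch` is a dict of lists; fn_kwargs supplies
    # `tokenizer` and `x_col`, so the tokenizer receives a list of texts.
    return tokenizer(batch[x_col], truncation=True)


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dset = Dataset.from_dict(
    {"abstractText": ["First toy abstract.", "Second toy abstract."]}
)
dset = dset.map(
    tokenize,
    batched=True,
    batch_size=256,
    desc="Tokenizing",
    fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"},
)
print(dset.column_names)  # e.g. ['abstractText', 'input_ids', 'token_type_ids', 'attention_mask']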
127 changes: 0 additions & 127 deletions grants_tagger_light/retagging/cnn_gpu_config.cfg

This file was deleted.

124 changes: 0 additions & 124 deletions grants_tagger_light/retagging/config.cfg

This file was deleted.
