Skip to content

Commit

Permalink
Enable running extractions on all files in a directory (#304)
Browse files Browse the repository at this point in the history
This enables passing a path to a directory through the `-i` input
option of the `extract` command.

All files with the `.txt` suffix will be parsed as inputs, and all outputs
will be written to the same output file.
  • Loading branch information
caufieldjh authored Jan 3, 2024
2 parents 327a7bc + 91b0962 commit 7553b3e
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 14 deletions.
4 changes: 4 additions & 0 deletions docs/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ The following options are available for most functions unless stated otherwise.

Use the option `--inputfile` to specify a path to a file containing input text.

For the `extract` command, this may be a single file or a directory of files.

In the latter case, all `.txt` files directly inside the directory will be assumed to be input; subdirectories will *not* be searched recursively.

### template

Use the option `--template` to specify a template to use. This is a required parameter.
Expand Down
50 changes: 36 additions & 14 deletions src/ontogpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,14 @@ def extract(
ontogpt extract -t gocam.GoCamAnnotations -i gocam-27929086.txt
The input argument must be either a file path or a string.
Use the -i/--input-file option followed by the path to the input file if using the former.
The input argument may be:
A file path,
A directory,
or a string.
Use the -i/--input-file option followed by the path to the input file
or directory.
If the input is a directory, all files with the .txt extension will be read.
This is not recursive.
Otherwise, the input is assumed to be a string to be read as input.
You can also use fragments of existing schemas, use the --target-class option (-T) to
Expand All @@ -298,9 +304,17 @@ def extract(
model_source = selectmodel["provider"]
model_name = selectmodel["alternative_names"][0]

inputlist = []

if not inputfile or inputfile == "-":
text = sys.stdin.read()
if inputfile and Path(inputfile).exists():
inputlist.append(text)
elif inputfile and Path(inputfile).is_dir():
logging.info(f"Input file directory: {inputfile}")
inputfiles = Path(inputfile).glob("*.txt")
inputlist = [open(f, "r").read() for f in inputfiles if f.is_file()]
logging.info(f"Found {len(inputlist)} input files here.")
elif inputfile and Path(inputfile).exists():
logging.info(f"Input file: {inputfile}")
if use_textract:
import textract
Expand All @@ -309,6 +323,7 @@ def extract(
else:
text = open(inputfile, "r").read()
logging.info(f"Input text: {text}")
inputlist.append(text)
elif inputfile and not Path(inputfile).exists():
raise FileNotFoundError(f"Cannot find input file {inputfile}")

Expand All @@ -333,12 +348,20 @@ def extract(
target_class_def = ke.schemaview.get_class(target_class)
else:
target_class_def = None
results = ke.extract_from_text(text=text, cls=target_class_def, show_prompt=show_prompt)
if set_slot_value:
for slot_value in set_slot_value:
slot, value = slot_value.split("=")
setattr(results.extracted_object, slot, value)
write_extraction(results, output, output_format, ke)

i = 0
for input_entry in inputlist:
if len(inputlist) > 1:
i = i + 1
logging.info(f"Now extracting from file {i} of {len(inputlist)}")
results = ke.extract_from_text(
text=input_entry, cls=target_class_def, show_prompt=show_prompt
)
if set_slot_value:
for slot_value in set_slot_value:
slot, value = slot_value.split("=")
setattr(results.extracted_object, slot, value)
write_extraction(results, output, output_format, ke)


@main.command()
Expand Down Expand Up @@ -1464,7 +1487,7 @@ def eval_enrichment(genes, input_file, number_to_drop, annotations_path, model,
default=False,
show_default=True,
help="If set, chunk input text, then prepare a separate prompt for each chunk."
" Otherwise the full input text is passed.",
" Otherwise the full input text is passed.",
)
@click.argument("evaluator")
def eval(evaluator, num_tests, output, chunking, model, **kwargs):
Expand All @@ -1477,10 +1500,9 @@ def eval(evaluator, num_tests, output, chunking, model, **kwargs):
else:
modelname = DEFAULT_MODEL

evaluator = create_evaluator(name=evaluator,
num_tests=num_tests,
chunking=chunking,
model=modelname)
evaluator = create_evaluator(
name=evaluator, num_tests=num_tests, chunking=chunking, model=modelname
)
eos = evaluator.eval()
output.write(dump_minimal_yaml(eos, minimize=False))

Expand Down

0 comments on commit 7553b3e

Please sign in to comment.