Enable running extractions on all files in a directory (#304)

This enables passing a path to a directory through the `-i` input command for the `extract` command. All files with the `.txt` suffix will be parsed as inputs and outputs will be written to the same output file.
monarch-initiative · Jan 3, 2024 · 7553b3e · 7553b3e
2 parents 327a7bc + 91b0962
commit 7553b3e
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 14 deletions.
diff --git a/docs/functions.md b/docs/functions.md
@@ -36,6 +36,10 @@ The following options are available for most functions unless stated otherwise.
 
 Use the option `--inputfile` to specify a path to a file containing input text.
 
+For the `extract` command, this may be a single file or a directory of files.
+
+In the latter case, all .txt files will be assumed to be input, and the path will *not* be parsed recursively.
+
 ### template
 
 Use the option `--template` to specify a template to use. This is a required parameter.

diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py
@@ -277,8 +277,14 @@ def extract(
 
         ontogpt extract -t gocam.GoCamAnnotations -i gocam-27929086.txt
 
-    The input argument must be either a file path or a string.
-    Use the -i/--input-file option followed by the path to the input file if using the former.
+    The input argument may be:
+        A file path,
+        A directory,
+        or a string.
+    Use the -i/--input-file option followed by the path to the input file
+    or directory.
+    If the input is a directory, all files with the .txt extension will be read.
+    This is not recursive.
     Otherwise, the input is assumed to be a string to be read as input.
 
     You can also use fragments of existing schemas, use the --target-class option (-T) to
@@ -298,9 +304,17 @@ def extract(
     model_source = selectmodel["provider"]
     model_name = selectmodel["alternative_names"][0]
 
+    inputlist = []
+
     if not inputfile or inputfile == "-":
         text = sys.stdin.read()
-    if inputfile and Path(inputfile).exists():
+        inputlist.append(text)
+    elif inputfile and Path(inputfile).is_dir():
+        logging.info(f"Input file directory: {inputfile}")
+        inputfiles = Path(inputfile).glob("*.txt")
+        inputlist = [open(f, "r").read() for f in inputfiles if f.is_file()]
+        logging.info(f"Found {len(inputlist)} input files here.")
+    elif inputfile and Path(inputfile).exists():
         logging.info(f"Input file: {inputfile}")
         if use_textract:
             import textract
@@ -309,6 +323,7 @@ def extract(
         else:
             text = open(inputfile, "r").read()
         logging.info(f"Input text: {text}")
+        inputlist.append(text)
     elif inputfile and not Path(inputfile).exists():
         raise FileNotFoundError(f"Cannot find input file {inputfile}")
 
@@ -333,12 +348,20 @@ def extract(
         target_class_def = ke.schemaview.get_class(target_class)
     else:
         target_class_def = None
-    results = ke.extract_from_text(text=text, cls=target_class_def, show_prompt=show_prompt)
-    if set_slot_value:
-        for slot_value in set_slot_value:
-            slot, value = slot_value.split("=")
-            setattr(results.extracted_object, slot, value)
-    write_extraction(results, output, output_format, ke)
+
+    i = 0
+    for input_entry in inputlist:
+        if len(inputlist) > 1:
+            i = i + 1
+            logging.info(f"Now extracting from file {i} of {len(inputlist)}")
+        results = ke.extract_from_text(
+            text=input_entry, cls=target_class_def, show_prompt=show_prompt
+        )
+        if set_slot_value:
+            for slot_value in set_slot_value:
+                slot, value = slot_value.split("=")
+                setattr(results.extracted_object, slot, value)
+        write_extraction(results, output, output_format, ke)
 
 
 @main.command()
@@ -1464,7 +1487,7 @@ def eval_enrichment(genes, input_file, number_to_drop, annotations_path, model,
     default=False,
     show_default=True,
     help="If set, chunk input text, then prepare a separate prompt for each chunk."
-            " Otherwise the full input text is passed.",
+    " Otherwise the full input text is passed.",
 )
 @click.argument("evaluator")
 def eval(evaluator, num_tests, output, chunking, model, **kwargs):
@@ -1477,10 +1500,9 @@ def eval(evaluator, num_tests, output, chunking, model, **kwargs):
     else:
         modelname = DEFAULT_MODEL
 
-    evaluator = create_evaluator(name=evaluator,
-                                 num_tests=num_tests,
-                                 chunking=chunking,
-                                 model=modelname)
+    evaluator = create_evaluator(
+        name=evaluator, num_tests=num_tests, chunking=chunking, model=modelname
+    )
     eos = evaluator.eval()
     output.write(dump_minimal_yaml(eos, minimize=False))