From d251e99572fc5cd2b1f58f090343b794f959523f Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 2 Jan 2024 15:26:28 -0500 Subject: [PATCH 1/5] pre-linting --- src/ontogpt/cli.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index 28e4906d6..751a77351 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -1464,7 +1464,7 @@ def eval_enrichment(genes, input_file, number_to_drop, annotations_path, model, default=False, show_default=True, help="If set, chunk input text, then prepare a separate prompt for each chunk." - " Otherwise the full input text is passed.", + " Otherwise the full input text is passed.", ) @click.argument("evaluator") def eval(evaluator, num_tests, output, chunking, model, **kwargs): @@ -1477,10 +1477,9 @@ def eval(evaluator, num_tests, output, chunking, model, **kwargs): else: modelname = DEFAULT_MODEL - evaluator = create_evaluator(name=evaluator, - num_tests=num_tests, - chunking=chunking, - model=modelname) + evaluator = create_evaluator( + name=evaluator, num_tests=num_tests, chunking=chunking, model=modelname + ) eos = evaluator.eval() output.write(dump_minimal_yaml(eos, minimize=False)) From d6f73488aeba5a5c811d7a5e39da29d47899821b Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 2 Jan 2024 15:50:48 -0500 Subject: [PATCH 2/5] Add directory parsing to extract command --- src/ontogpt/cli.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index 751a77351..b79a8511e 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -277,8 +277,14 @@ def extract( ontogpt extract -t gocam.GoCamAnnotations -i gocam-27929086.txt - The input argument must be either a file path or a string. - Use the -i/--input-file option followed by the path to the input file if using the former. + The input argument may be: + A file path, + A directory, + or a string. + Use the -i/--input-file option followed by the path to the input file + or directory. + If the input is a directory, all files with the .txt extension will be read. + This is not recursive. Otherwise, the input is assumed to be a string to be read as input. You can also use fragments of existing schemas, use the --target-class option (-T) to @@ -298,9 +304,17 @@ def extract( model_source = selectmodel["provider"] model_name = selectmodel["alternative_names"][0] + inputlist = [] + if not inputfile or inputfile == "-": text = sys.stdin.read() - if inputfile and Path(inputfile).exists(): + inputlist.append(text) + elif inputfile and Path(inputfile).is_dir(): + logging.info(f"Input file directory: {inputfile}") + inputfiles = Path(inputfile).glob('*.txt') + inputlist = [open(f, "r").read() for f in inputfiles if f.is_file()] + logging.info(f"Found {len(inputlist)} input files here.") + elif inputfile and Path(inputfile).exists(): logging.info(f"Input file: {inputfile}") if use_textract: import textract @@ -309,6 +323,7 @@ def extract( else: text = open(inputfile, "r").read() logging.info(f"Input text: {text}") + inputlist.append(text) elif inputfile and not Path(inputfile).exists(): raise FileNotFoundError(f"Cannot find input file {inputfile}") @@ -333,12 +348,14 @@ def extract( target_class_def = ke.schemaview.get_class(target_class) else: target_class_def = None - results = ke.extract_from_text(text=text, cls=target_class_def, show_prompt=show_prompt) - if set_slot_value: - for slot_value in set_slot_value: - slot, value = slot_value.split("=") - setattr(results.extracted_object, slot, value) - write_extraction(results, output, output_format, ke) + + for input_entry in inputlist: + results = ke.extract_from_text(text=input_entry, cls=target_class_def, show_prompt=show_prompt) + if set_slot_value: + for slot_value in set_slot_value: + slot, value = slot_value.split("=") + setattr(results.extracted_object, slot, value) + write_extraction(results, output, output_format, ke) @main.command() From ca50c0819e7f222858f85cf129776c6ebfc7a94e Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 2 Jan 2024 15:55:25 -0500 Subject: [PATCH 3/5] Add a lil' counter --- src/ontogpt/cli.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index b79a8511e..aaa6d61fe 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -277,7 +277,7 @@ def extract( ontogpt extract -t gocam.GoCamAnnotations -i gocam-27929086.txt - The input argument may be: + The input argument may be: A file path, A directory, or a string. @@ -311,7 +311,7 @@ def extract( inputlist.append(text) elif inputfile and Path(inputfile).is_dir(): logging.info(f"Input file directory: {inputfile}") - inputfiles = Path(inputfile).glob('*.txt') + inputfiles = Path(inputfile).glob("*.txt") inputlist = [open(f, "r").read() for f in inputfiles if f.is_file()] logging.info(f"Found {len(inputlist)} input files here.") elif inputfile and Path(inputfile).exists(): @@ -349,8 +349,14 @@ def extract( else: target_class_def = None + i = 0 for input_entry in inputlist: - results = ke.extract_from_text(text=input_entry, cls=target_class_def, show_prompt=show_prompt) + if len(inputlist) > 1: + i = i + 1 + logging.info(f"Now reading file {i} of {len(inputlist)}") + results = ke.extract_from_text( + text=input_entry, cls=target_class_def, show_prompt=show_prompt + ) if set_slot_value: for slot_value in set_slot_value: slot, value = slot_value.split("=") From 4327b0813baa5f1ac61172dac456b825b25397a7 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 2 Jan 2024 15:57:45 -0500 Subject: [PATCH 4/5] Minor change to logging text --- src/ontogpt/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index aaa6d61fe..c36775890 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -353,7 +353,7 @@ def extract( for input_entry in inputlist: if len(inputlist) > 1: i = i + 1 - logging.info(f"Now reading file {i} of {len(inputlist)}") + logging.info(f"Now extracting from file {i} of {len(inputlist)}") results = ke.extract_from_text( text=input_entry, cls=target_class_def, show_prompt=show_prompt ) From 91b0962e2ad8e9fd78926a48c1c4ffdefa4cf7ba Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 2 Jan 2024 16:00:42 -0500 Subject: [PATCH 5/5] Update docs about inputfile --- docs/functions.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/functions.md b/docs/functions.md index 23d68e572..a38e48b21 100644 --- a/docs/functions.md +++ b/docs/functions.md @@ -36,6 +36,10 @@ The following options are available for most functions unless stated otherwise. Use the option `--inputfile` to specify a path to a file containing input text. +For the `extract` command, this may be a single file or a directory of files. + +In the latter case, all .txt files will be assumed to be input, and the path will *not* be parsed recursively. + ### template Use the option `--template` to specify a template to use. This is a required parameter.