From 5331769b4e5cb0ad22dc44b8cceac5a602239297 Mon Sep 17 00:00:00 2001
From: kimbwook
Date: Mon, 9 Dec 2024 13:57:11 +0900
Subject: [PATCH 1/2] change file type logic

---
 autorag/VERSION                            |  2 +-
 autorag/data/parse/run.py                  |  7 ++++---
 tests/autorag/data/parse/test_parse_run.py | 22 ++++++++++++++++++++++
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/autorag/VERSION b/autorag/VERSION
index 7c97c9b17..c44fec2a0 100644
--- a/autorag/VERSION
+++ b/autorag/VERSION
@@ -1 +1 @@
-0.3.11rc2
+0.3.11rc3
diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py
index 849b2cbc3..1a8b1f975 100644
--- a/autorag/data/parse/run.py
+++ b/autorag/data/parse/run.py
@@ -56,9 +56,10 @@ def run_parser(
     # create a list of only those file_types that are in file_types but not in set_file_types
     missing_file_types = list(file_types - set_file_types)
     if list(set_file_types - file_types):
-        raise ValueError(
-            f"File types {list(set_file_types - file_types)} are not in the data path."
-        )
+        for module, module_param in zip(modules, module_params):
+            if module_param["file_type"] in list(set_file_types - file_types):
+                modules.remove(module)
+                module_params.remove(module_param)
 
     if missing_file_types:
         add_modules_list = []
diff --git a/tests/autorag/data/parse/test_parse_run.py b/tests/autorag/data/parse/test_parse_run.py
index 9c97a6d03..bf0c2f76f 100644
--- a/tests/autorag/data/parse/test_parse_run.py
+++ b/tests/autorag/data/parse/test_parse_run.py
@@ -24,3 +24,25 @@ def test_run_parser():
             "file_type": "pdf",
         }
         assert os.path.exists(os.path.join(temp_dir, "pdf.parquet"))
+
+
+def test_run_parser_two():
+    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
+        modules = [langchain_parse, langchain_parse]
+        module_params = [
+            {"parse_method": "pdfminer", "file_type": "pdf"},
+            {"parse_method": "csv", "file_type": "csv"},
+        ]
+        data_path_glob = eng_text_glob
+        summary_df = run_parser(
+            modules, module_params, data_path_glob, temp_dir, all_files=False
+        )
+        assert os.path.exists(os.path.join(temp_dir, "summary.csv"))
+        expect_columns = {"filename", "module_name", "module_params", "execution_time"}
+        assert set(summary_df.columns) == expect_columns
+        assert len(summary_df) == 1
+        assert summary_df["module_params"][0] == {
+            "parse_method": "pdfminer",
+            "file_type": "pdf",
+        }
+        assert os.path.exists(os.path.join(temp_dir, "pdf.parquet"))
From f562c5ca8a65f7a7f2ab44fd6beaf81a724b55c8 Mon Sep 17 00:00:00 2001
From: kimbwook
Date: Mon, 9 Dec 2024 14:11:26 +0900
Subject: [PATCH 2/2] change delete logic

Filter `modules` before `module_params` so that zip() pairs every module
with its own, still-unfiltered params entry.

---
 autorag/data/parse/run.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py
index 1a8b1f975..a289e35ba 100644
--- a/autorag/data/parse/run.py
+++ b/autorag/data/parse/run.py
@@ -53,13 +53,23 @@ def run_parser(
     )
 
     set_file_types = set([module["file_type"] for module in module_params])
+    # Calculate the set difference once
+    file_types_to_remove = set_file_types - file_types
+
+    # Filter modules first: zip() must pair each module with its original params
+    modules = [
+        module
+        for module, param in zip(modules, module_params)
+        if param["file_type"] not in file_types_to_remove
+    ]
+    module_params = [
+        param
+        for param in module_params
+        if param["file_type"] not in file_types_to_remove
+    ]
+
     # create a list of only those file_types that are in file_types but not in set_file_types
     missing_file_types = list(file_types - set_file_types)
-    if list(set_file_types - file_types):
-        for module, module_param in zip(modules, module_params):
-            if module_param["file_type"] in list(set_file_types - file_types):
-                modules.remove(module)
-                module_params.remove(module_param)
 
     if missing_file_types:
         add_modules_list = []