diff --git a/autorag/VERSION b/autorag/VERSION index 7c97c9b17..0b9c01996 100644 --- a/autorag/VERSION +++ b/autorag/VERSION @@ -1 +1 @@ -0.3.11rc2 +0.3.12 diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py index 849b2cbc3..502f052d7 100644 --- a/autorag/data/parse/run.py +++ b/autorag/data/parse/run.py @@ -53,12 +53,23 @@ def run_parser( ) set_file_types = set([module["file_type"] for module in module_params]) + # Calculate the set difference once + file_types_to_remove = set_file_types - file_types + + # Filter modules first, while module_params is still index-aligned with it for zip() + modules = [ + module + for module, param in zip(modules, module_params) + if param["file_type"] not in file_types_to_remove + ] + module_params = [ + param + for param in module_params + if param["file_type"] not in file_types_to_remove + ] + # create a list of only those file_types that are in file_types but not in set_file_types missing_file_types = list(file_types - set_file_types) - if list(set_file_types - file_types): - raise ValueError( - f"File types {list(set_file_types - file_types)} are not in the data path." - ) if missing_file_types: add_modules_list = [] @@ -84,11 +95,11 @@ def run_parser( # save results to parquet files if all_files: - filepaths = list( - map( - lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules)) + if len(module_params) > 1: + raise ValueError( + "All files is set to True, You can only use one parsing module." 
) - ) + filepaths = [os.path.join(project_dir, "parsed_result.parquet")] else: filepaths = list( map( diff --git a/docs/source/data_creation/parse/parse.md b/docs/source/data_creation/parse/parse.md index 2de27dad5..255cf2324 100644 --- a/docs/source/data_creation/parse/parse.md +++ b/docs/source/data_creation/parse/parse.md @@ -154,8 +154,11 @@ If the parsing is completed successfully, the following three types of files are #### Use all files -For example, if parsing is performed using three parse methods, the following files are created. -`0.parquet`, `1.parquet`, `2.parquet`, `parse_config.yaml`, `summary.csv` +You can use only one parse method at a time. + +The parsed result will be saved as `parsed_result.parquet`. + +If you want to use two all_files parse methods, you should run the parse pipeline twice with two different YAML files. Finally, in the summary.csv file, you can see information about the parsed result, such as what parse method was used to parse it. diff --git a/sample_config/parse/all_files_full.yaml b/sample_config/parse/all_files_full.yaml index 441a4ce44..9cdbf7e92 100644 --- a/sample_config/parse/all_files_full.yaml +++ b/sample_config/parse/all_files_full.yaml @@ -1,10 +1,22 @@ +# You can use only one of the following modules at a time. 
modules: + # Use Directory Parse - module_type: langchain_parse file_type: all_files - parse_method: [ directory, unstructured, upstagedocumentparse ] + parse_method: directory + # Use Unstructured + - module_type: langchain_parse + file_type: all_files + parse_method: unstructured + # Use Upstage Document Parse + - module_type: langchain_parse + file_type: all_files + parse_method: upstagedocumentparse + # Use Naver Clova OCR - module_type: clova file_type: all_files table_detection: true + # Use Llama Parse - module_type: llamaparse file_type: all_files result_type: markdown diff --git a/tests/autorag/data/parse/test_parse_run.py b/tests/autorag/data/parse/test_parse_run.py index 9c97a6d03..bf0c2f76f 100644 --- a/tests/autorag/data/parse/test_parse_run.py +++ b/tests/autorag/data/parse/test_parse_run.py @@ -24,3 +24,25 @@ def test_run_parser(): "file_type": "pdf", } assert os.path.exists(os.path.join(temp_dir, "pdf.parquet")) + + +def test_run_parser_two(): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir: + modules = [langchain_parse, langchain_parse] + module_params = [ + {"parse_method": "pdfminer", "file_type": "pdf"}, + {"parse_method": "csv", "file_type": "csv"}, + ] + data_path_glob = eng_text_glob + summary_df = run_parser( + modules, module_params, data_path_glob, temp_dir, all_files=False + ) + assert os.path.exists(os.path.join(temp_dir, "summary.csv")) + expect_columns = {"filename", "module_name", "module_params", "execution_time"} + assert set(summary_df.columns) == expect_columns + assert len(summary_df) == 1 + assert summary_df["module_params"][0] == { + "parse_method": "pdfminer", + "file_type": "pdf", + } + assert os.path.exists(os.path.join(temp_dir, "pdf.parquet")) diff --git a/tests/autorag/test_parser.py b/tests/autorag/test_parser.py index 5c2edf82b..552124cdb 100644 --- a/tests/autorag/test_parser.py +++ b/tests/autorag/test_parser.py @@ -157,8 +157,10 @@ def test_start_parsing_all_files(simple_parser): 
project_dir = simple_parser.project_dir assert os.path.exists(project_dir) assert os.path.exists(os.path.join(project_dir, "parse_config.yaml")) - assert os.path.exists(os.path.join(project_dir, "0.parquet")) - all_files_result = pd.read_parquet(os.path.join(project_dir, "0.parquet")) + assert os.path.exists(os.path.join(project_dir, "parsed_result.parquet")) + all_files_result = pd.read_parquet( + os.path.join(project_dir, "parsed_result.parquet") + ) expect_result_columns = ["texts", "path", "page", "last_modified_datetime"] assert all(