Skip to content

Commit

Permalink
Merge branch 'main' into Feature/#1042
Browse files Browse the repository at this point in the history
  • Loading branch information
bwook00 authored Dec 11, 2024
2 parents 8e2bf9d + 3ea3d8d commit 1c575b9
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 14 deletions.
2 changes: 1 addition & 1 deletion autorag/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.11rc2
0.3.12
27 changes: 19 additions & 8 deletions autorag/data/parse/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,23 @@ def run_parser(
)
set_file_types = set([module["file_type"] for module in module_params])

# Calculate the set difference once
file_types_to_remove = set_file_types - file_types

# Use list comprehension to filter out unwanted elements
module_params = [
param
for param in module_params
if param["file_type"] not in file_types_to_remove
]
modules = [
module
for module, param in zip(modules, module_params)
if param["file_type"] not in file_types_to_remove
]

# create a list of only those file_types that are in file_types but not in set_file_types
missing_file_types = list(file_types - set_file_types)
if list(set_file_types - file_types):
raise ValueError(
f"File types {list(set_file_types - file_types)} are not in the data path."
)

if missing_file_types:
add_modules_list = []
Expand All @@ -84,11 +95,11 @@ def run_parser(

# save results to parquet files
if all_files:
filepaths = list(
map(
lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules))
if len(module_params) > 1:
raise ValueError(
"All files is set to True, You can only use one parsing module."
)
)
filepaths = [os.path.join(project_dir, "parsed_result.parquet")]
else:
filepaths = list(
map(
Expand Down
7 changes: 5 additions & 2 deletions docs/source/data_creation/parse/parse.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,11 @@ If the parsing is completed successfully, the following three types of files are

#### Use all files

For example, if parsing is performed using three parse methods, the following files are created.
`0.parquet`, `1.parquet`, `2.parquet`, `parse_config.yaml`, `summary.csv`
You can use only one parse method at a time.

The parsed result is saved as `parsed_result.parquet`.

If you want to use two all_files parse methods, run the parse pipeline twice with two different YAML files.

Finally, in the summary.csv file, you can see information about the parsed result, such as what parse method was used to parse it.

Expand Down
14 changes: 13 additions & 1 deletion sample_config/parse/all_files_full.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
# You can use only one of the following modules at a time.
modules:
# Use Directory Parse
- module_type: langchain_parse
file_type: all_files
parse_method: [ directory, unstructured, upstagedocumentparse ]
parse_method: directory
# Use Unstructured
- module_type: langchain_parse
file_type: all_files
parse_method: unstructured
# Use Upstage Document Parse
- module_type: langchain_parse
file_type: all_files
parse_method: upstagedocumentparse
# Use Naver Clova OCR
- module_type: clova
file_type: all_files
table_detection: true
# Use Llama Parse
- module_type: llamaparse
file_type: all_files
result_type: markdown
Expand Down
22 changes: 22 additions & 0 deletions tests/autorag/data/parse/test_parse_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,25 @@ def test_run_parser():
"file_type": "pdf",
}
assert os.path.exists(os.path.join(temp_dir, "pdf.parquet"))


def test_run_parser_two():
    """Run the parser with a pdf module and a csv module and check the outputs.

    Two ``langchain_parse`` modules are configured (pdfminer for pdf, csv for
    csv), yet the test expects a single summary row for the pdf module and a
    ``pdf.parquet`` result file.
    NOTE(review): presumably the data matched by ``eng_text_glob`` contains no
    csv files, so the csv module is dropped by ``run_parser`` -- confirm
    against the fixture data.
    """
    parse_configs = [
        {"parse_method": "pdfminer", "file_type": "pdf"},
        {"parse_method": "csv", "file_type": "csv"},
    ]
    # One parse function per configuration entry.
    parse_funcs = [langchain_parse] * len(parse_configs)

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as work_dir:
        summary = run_parser(
            parse_funcs, parse_configs, eng_text_glob, work_dir, all_files=False
        )

        # The summary file must have been written to the project directory.
        summary_path = os.path.join(work_dir, "summary.csv")
        assert os.path.exists(summary_path)

        # The returned summary has the expected schema and exactly one row.
        assert set(summary.columns) == {
            "filename",
            "module_name",
            "module_params",
            "execution_time",
        }
        assert len(summary) == 1

        # That single row belongs to the pdf module.
        first_params = summary["module_params"][0]
        assert first_params == {
            "parse_method": "pdfminer",
            "file_type": "pdf",
        }

        # Per-file-type output is named after the file type.
        pdf_result_path = os.path.join(work_dir, "pdf.parquet")
        assert os.path.exists(pdf_result_path)
6 changes: 4 additions & 2 deletions tests/autorag/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,10 @@ def test_start_parsing_all_files(simple_parser):
project_dir = simple_parser.project_dir
assert os.path.exists(project_dir)
assert os.path.exists(os.path.join(project_dir, "parse_config.yaml"))
assert os.path.exists(os.path.join(project_dir, "0.parquet"))
all_files_result = pd.read_parquet(os.path.join(project_dir, "0.parquet"))
assert os.path.exists(os.path.join(project_dir, "parsed_result.parquet"))
all_files_result = pd.read_parquet(
os.path.join(project_dir, "parsed_result.parquet")
)

expect_result_columns = ["texts", "path", "page", "last_modified_datetime"]
assert all(
Expand Down

0 comments on commit 1c575b9

Please sign in to comment.