support parsing multiple types of documents at once (#1015)
* just commit

* support parsing multiple types of documents at once

* Add docs

* Change to a compatible split code on Windows
bwook00 authored Nov 29, 2024
1 parent c9072e3 commit 1a49236
Showing 31 changed files with 844 additions and 73 deletions.
33 changes: 29 additions & 4 deletions autorag/data/parse/base.py
@@ -3,6 +3,7 @@
from datetime import datetime
from glob import glob
from typing import Tuple, List, Optional
import os

from autorag.utils import result_to_dataframe
from autorag.data.utils.util import get_file_metadata
@@ -14,14 +15,38 @@ def parser_node(func):
@functools.wraps(func)
@result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
def wrapper(
-    data_path_glob: str, parse_method: Optional[str] = None, **kwargs
+    data_path_glob: str,
+    file_type: str,
+    parse_method: Optional[str] = None,
+    **kwargs,
) -> Tuple[List[str], List[str], List[int], List[datetime]]:
logger.info(f"Running parser - {func.__name__} module...")

data_path_list = glob(data_path_glob)
if not data_path_list:
raise FileNotFoundError(f"data does not exist in {data_path_glob}")

assert file_type in [
"pdf",
"csv",
"json",
"md",
"html",
"xml",
"all_files",
], f"search type {file_type} is not supported"

# extract only files from data_path_list based on the file_type set in the YAML file
data_paths = (
[
data_path
for data_path in data_path_list
if os.path.basename(data_path).split(".")[-1] == file_type
]
if file_type != "all_files"
else data_path_list
)

if func.__name__ == "langchain_parse":
parse_method = parse_method.lower()
if parse_method == "directory":
@@ -30,14 +55,14 @@ def wrapper(
folder_path = "/".join(path_split_list)
kwargs.update({"glob": glob_path, "path": folder_path})
result = func(
-    data_path_list=data_path_list, parse_method=parse_method, **kwargs
+    data_path_list=data_paths, parse_method=parse_method, **kwargs
)
else:
result = func(
-    data_path_list=data_path_list, parse_method=parse_method, **kwargs
+    data_path_list=data_paths, parse_method=parse_method, **kwargs
)
elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]:
-result = func(data_path_list=data_path_list, **kwargs)
+result = func(data_path_list=data_paths, **kwargs)
else:
raise ValueError(f"Unsupported module_type: {func.__name__}")
result = _add_last_modified_datetime(result)
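Outside the diff, the new `file_type` filtering reduces to the following minimal sketch. `filter_by_file_type` is a hypothetical helper name for illustration, not a function in the codebase:

```python
import os
from glob import glob
from typing import List


def filter_by_file_type(data_path_glob: str, file_type: str) -> List[str]:
    """Keep only the globbed paths whose final extension matches file_type."""
    data_path_list = glob(data_path_glob)
    if not data_path_list:
        raise FileNotFoundError(f"data does not exist in {data_path_glob}")
    if file_type == "all_files":
        return data_path_list
    # Only the last extension segment is compared: "report.v2.pdf" -> "pdf".
    # The comparison is case-sensitive, so "DOC.PDF" would not match "pdf".
    return [
        path
        for path in data_path_list
        if os.path.basename(path).split(".")[-1] == file_type
    ]
```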
88 changes: 85 additions & 3 deletions autorag/data/parse/run.py
@@ -1,16 +1,79 @@
import os
from typing import List, Callable, Dict
import pandas as pd
from glob import glob

from autorag.strategy import measure_speed
from autorag.data.utils.util import get_param_combinations

default_map = {
"pdf": {
"file_type": "pdf",
"module_type": "langchain_parse",
"parse_method": "pdfminer",
},
"csv": {
"file_type": "csv",
"module_type": "langchain_parse",
"parse_method": "csv",
},
"md": {
"file_type": "md",
"module_type": "langchain_parse",
"parse_method": "unstructuredmarkdown",
},
"html": {
"file_type": "html",
"module_type": "langchain_parse",
"parse_method": "bshtml",
},
"xml": {
"file_type": "xml",
"module_type": "langchain_parse",
"parse_method": "unstructuredxml",
},
}


def run_parser(
modules: List[Callable],
module_params: List[Dict],
data_path_glob: str,
project_dir: str,
all_files: bool,
):
if not all_files:
# Fall back to a default parsing module for any file type present in the paths but not set in the YAML.
data_path_list = glob(data_path_glob)
if not data_path_list:
raise FileNotFoundError(f"data does not exist in {data_path_glob}")

file_types = set(
[os.path.basename(data_path).split(".")[-1] for data_path in data_path_list]
)
set_file_types = set([module["file_type"] for module in module_params])

# Collect file types that appear in the data but are not configured in the YAML.
missing_file_types = list(file_types - set_file_types)
if list(set_file_types - file_types):
raise ValueError(
f"File types {list(set_file_types - file_types)} are not in the data path."
)

if missing_file_types:
add_modules_list = []
for missing_file_type in missing_file_types:
if missing_file_type == "json":
raise ValueError(
"JSON file type must have a jq_schema so you must set it in the YAML file."
)

add_modules_list.append(default_map[missing_file_type])

add_modules, add_params = get_param_combinations(add_modules_list)
modules.extend(add_modules)
module_params.extend(add_params)

results, execution_times = zip(
*map(
lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),
@@ -20,9 +83,19 @@ def run_parser(
average_times = list(map(lambda x: x / len(results[0]), execution_times))

# save results to parquet files
-filepaths = list(
-    map(lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules)))
-)
+if all_files:
+    filepaths = list(
+        map(
+            lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules))
+        )
+    )
+else:
+    filepaths = list(
+        map(
+            lambda x: os.path.join(project_dir, f"{x['file_type']}.parquet"),
+            module_params,
+        )
+    )
list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)))
filenames = list(map(lambda x: os.path.basename(x), filepaths))

@@ -35,4 +108,13 @@ def run_parser(
}
)
summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False)

# Concatenate the per-file-type parquet files into a single result when all_files is False.
if not all_files:
dataframes = [pd.read_parquet(file) for file in filepaths]
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_parquet(
os.path.join(project_dir, "parsed_result.parquet"), index=False
)

return summary_df
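In isolation, the fallback for unconfigured file types reduces to this sketch. `modules_to_add` is a hypothetical name, and `default_map` is trimmed to two entries for brevity, so unsupported extensions would raise a `KeyError` here:

```python
import os
from glob import glob
from typing import Dict, List

# Trimmed copy of the default_map shown in the diff above.
default_map = {
    "pdf": {"file_type": "pdf", "module_type": "langchain_parse", "parse_method": "pdfminer"},
    "csv": {"file_type": "csv", "module_type": "langchain_parse", "parse_method": "csv"},
}


def modules_to_add(data_path_glob: str, module_params: List[Dict]) -> List[Dict]:
    """Return default module configs for file types on disk but absent from the YAML."""
    file_types = {os.path.basename(p).split(".")[-1] for p in glob(data_path_glob)}
    set_file_types = {module["file_type"] for module in module_params}
    unknown = set_file_types - file_types
    if unknown:
        raise ValueError(f"File types {list(unknown)} are not in the data path.")
    missing = file_types - set_file_types
    if "json" in missing:
        # JSON needs a jq_schema, which only the YAML file can provide.
        raise ValueError("JSON file type must have a jq_schema set in the YAML file.")
    return [default_map[file_type] for file_type in missing]
```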
3 changes: 2 additions & 1 deletion autorag/parser.py
@@ -14,7 +14,7 @@ def __init__(self, data_path_glob: str, project_dir: Optional[str] = None):
self.data_path_glob = data_path_glob
self.project_dir = project_dir if project_dir is not None else os.getcwd()

-def start_parsing(self, yaml_path: str):
+def start_parsing(self, yaml_path: str, all_files: bool = False):
if not os.path.exists(self.project_dir):
os.makedirs(self.project_dir)

@@ -32,5 +32,6 @@ def start_parsing(self, yaml_path: str):
module_params=input_params,
data_path_glob=self.data_path_glob,
project_dir=self.project_dir,
all_files=all_files,
)
logger.info("Parsing Done!")
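Putting the pieces together, a usage sketch of the new flag (paths are placeholders):

```python
from autorag.parser import Parser

parser = Parser(data_path_glob="your/data/path/*", project_dir="your/project/dir")
# all_files=False (the default) routes each file type to its own parsing module
# and writes one parquet per file type plus a combined parsed_result.parquet;
# all_files=True keeps the previous behavior of one numbered parquet per module.
parser.start_parsing("your/path/to/parse_config.yaml", all_files=False)
```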
104 changes: 104 additions & 0 deletions docs/source/data_creation/legacy/parse.md
@@ -0,0 +1,104 @@
# Parse

In this section, we will cover how to parse raw documents.

Parsing the raw documents is a crucial step: if the raw documents are not parsed well, the RAG pipeline cannot be optimized well.

Using only a YAML file, you can easily use various document loaders.
The parsed result is saved in the data format used by AutoRAG.

## Overview

A sample parse pipeline looks like this:

```python
from autorag.parser import Parser

parser = Parser(data_path_glob="your/data/path/*")
parser.start_parsing("your/path/to/parse_config.yaml")
```

## Run Parse Pipeline

### 1. Set parser instance

```python
from autorag.parser import Parser

parser = Parser(data_path_glob="your/data/path/*")
```

#### 📌 Parameter: `data_path_glob`

The Parser instance requires the `data_path_glob` parameter.
This parameter specifies the path of the documents to be parsed.

Only glob patterns are supported.

You can use the wildcard character `*` to match multiple files.

You can also specify a file extension, such as `*.pdf`, to parse only specific file types.

```{admonition} Want to specify project folder?
You can specify the project directory with the `--project_dir` option or the `project_dir` parameter.
```
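
For example, a minimal sketch that sets `project_dir` via the parameter (paths are placeholders):

```python
from autorag.parser import Parser

# If project_dir is omitted, results are written to the current working directory.
parser = Parser(data_path_glob="your/data/path/*.pdf", project_dir="your/project/dir")
```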

### 2. Set YAML file

Here is an example of how to use the `langchain_parse` module.

```yaml
modules:
- module_type: langchain_parse
parse_method: [ pdfminer, pdfplumber ]
```
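
With the `file_type` option added in this commit, a single YAML file can mix document types at once; a sketch whose parse methods mirror the built-in defaults:

```yaml
modules:
  - module_type: langchain_parse
    file_type: pdf
    parse_method: pdfminer
  - module_type: langchain_parse
    file_type: csv
    parse_method: csv
  - module_type: langchain_parse
    file_type: md
    parse_method: unstructuredmarkdown
```
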
### 3. Start parsing

Use the `start_parsing` function to start parsing.

```python
parser.start_parsing("your/path/to/parse_config.yaml")
```

### 4. Check the result

If you set the `project_dir` parameter, you can check the result in that project directory.
If not, you can check the result in the current directory.

If the parsing is completed successfully, the following three types of files are created in the `project_dir`.

1. Parsed Result
2. Used YAML file
3. Summary file

For example, if parsing is performed using three parse methods, the following files are created:
`0.parquet`, `1.parquet`, `2.parquet`, `parse_config.yaml`, and `summary.csv`.

Finally, the `summary.csv` file shows information about the parsed result, such as which parse method was used to parse it.
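
To peek at a parsed result, a minimal sketch (the filename and path are illustrative):

```python
import pandas as pd

df = pd.read_parquet("your/project/dir/0.parquet")
print(df[["texts", "path", "page", "last_modified_datetime"]].head())
```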

## Output Columns

- `texts`: Parsed text from the document.
- `path`: Path of the document.
- `page`: Page number of the parsed text. Contains the page number when the module parses per page, otherwise -1.
  - Modules that parse per page: [ `clova`, `table_hybrid_parse` ]
  - Modules that do not parse per page: [ `langchain_parse`, `llama_parse` ]
- `last_modified_datetime`: Last modified datetime of the document.

#### Supported Parse Modules

📌 You can check all of our parse modules [here](https://edai.notion.site/Supporting-Parse-Modules-e0b7579c7c0e4fb2963e408eeccddd75?pvs=4).


```{toctree}
---
maxdepth: 1
---
langchain_parse.md
llama_parse.md
clova.md
table_hybrid_parse.md
```