support parsing multiple types of documents at once (#1015)
* just commit

* support parsing multiple types of documents at once

* Add docs

* Change to a compatible split code on Windows
bwook00 authored Nov 29, 2024
1 parent c9072e3 commit 1a49236
Showing 31 changed files with 844 additions and 73 deletions.
33 changes: 29 additions & 4 deletions autorag/data/parse/base.py
@@ -3,6 +3,7 @@
from datetime import datetime
from glob import glob
from typing import Tuple, List, Optional
import os

from autorag.utils import result_to_dataframe
from autorag.data.utils.util import get_file_metadata
@@ -14,14 +15,38 @@ def parser_node(func):
@functools.wraps(func)
@result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
def wrapper(
-    data_path_glob: str, parse_method: Optional[str] = None, **kwargs
+    data_path_glob: str,
+    file_type: str,
+    parse_method: Optional[str] = None,
+    **kwargs,
) -> Tuple[List[str], List[str], List[int], List[datetime]]:
logger.info(f"Running parser - {func.__name__} module...")

data_path_list = glob(data_path_glob)
if not data_path_list:
raise FileNotFoundError(f"data does not exist in {data_path_glob}")

assert file_type in [
"pdf",
"csv",
"json",
"md",
"html",
"xml",
"all_files",
], f"search type {file_type} is not supported"

# extract only files from data_path_list based on the file_type set in the YAML file
data_paths = (
[
data_path
for data_path in data_path_list
if os.path.basename(data_path).split(".")[-1] == file_type
]
if file_type != "all_files"
else data_path_list
)

if func.__name__ == "langchain_parse":
parse_method = parse_method.lower()
if parse_method == "directory":
@@ -30,14 +55,14 @@ def wrapper(
folder_path = "/".join(path_split_list)
kwargs.update({"glob": glob_path, "path": folder_path})
result = func(
-    data_path_list=data_path_list, parse_method=parse_method, **kwargs
+    data_path_list=data_paths, parse_method=parse_method, **kwargs
)
else:
result = func(
-    data_path_list=data_path_list, parse_method=parse_method, **kwargs
+    data_path_list=data_paths, parse_method=parse_method, **kwargs
)
elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]:
-result = func(data_path_list=data_path_list, **kwargs)
+result = func(data_path_list=data_paths, **kwargs)
else:
raise ValueError(f"Unsupported module_type: {func.__name__}")
result = _add_last_modified_datetime(result)
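Outside the diff, the new `file_type` filtering reduces to the following minimal sketch. `filter_by_file_type` is a hypothetical helper name for illustration, not a function in the codebase:

```python
import os
from glob import glob
from typing import List


def filter_by_file_type(data_path_glob: str, file_type: str) -> List[str]:
    """Keep only the globbed paths whose final extension matches file_type."""
    data_path_list = glob(data_path_glob)
    if not data_path_list:
        raise FileNotFoundError(f"data does not exist in {data_path_glob}")
    if file_type == "all_files":
        return data_path_list
    # Only the last extension segment is compared: "report.v2.pdf" -> "pdf".
    # The comparison is case-sensitive, so "DOC.PDF" would not match "pdf".
    return [
        path
        for path in data_path_list
        if os.path.basename(path).split(".")[-1] == file_type
    ]
```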
88 changes: 85 additions & 3 deletions autorag/data/parse/run.py
@@ -1,16 +1,79 @@
import os
from typing import List, Callable, Dict
import pandas as pd
from glob import glob

from autorag.strategy import measure_speed
from autorag.data.utils.util import get_param_combinations

default_map = {
"pdf": {
"file_type": "pdf",
"module_type": "langchain_parse",
"parse_method": "pdfminer",
},
"csv": {
"file_type": "csv",
"module_type": "langchain_parse",
"parse_method": "csv",
},
"md": {
"file_type": "md",
"module_type": "langchain_parse",
"parse_method": "unstructuredmarkdown",
},
"html": {
"file_type": "html",
"module_type": "langchain_parse",
"parse_method": "bshtml",
},
"xml": {
"file_type": "xml",
"module_type": "langchain_parse",
"parse_method": "unstructuredxml",
},
}


def run_parser(
modules: List[Callable],
module_params: List[Dict],
data_path_glob: str,
project_dir: str,
all_files: bool,
):
if not all_files:
# Fall back to a default parsing module for any file type present in the paths but not set in the YAML.
data_path_list = glob(data_path_glob)
if not data_path_list:
raise FileNotFoundError(f"data does not exist in {data_path_glob}")

file_types = set(
[os.path.basename(data_path).split(".")[-1] for data_path in data_path_list]
)
set_file_types = set([module["file_type"] for module in module_params])

# Collect file types that appear in the data but are not configured in the YAML.
missing_file_types = list(file_types - set_file_types)
if list(set_file_types - file_types):
raise ValueError(
f"File types {list(set_file_types - file_types)} are not in the data path."
)

if missing_file_types:
add_modules_list = []
for missing_file_type in missing_file_types:
if missing_file_type == "json":
raise ValueError(
"JSON file type must have a jq_schema so you must set it in the YAML file."
)

add_modules_list.append(default_map[missing_file_type])

add_modules, add_params = get_param_combinations(add_modules_list)
modules.extend(add_modules)
module_params.extend(add_params)

results, execution_times = zip(
*map(
lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),
@@ -20,9 +83,19 @@ def run_parser(
average_times = list(map(lambda x: x / len(results[0]), execution_times))

# save results to parquet files
-filepaths = list(
-    map(lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules)))
-)
+if all_files:
+    filepaths = list(
+        map(
+            lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules))
+        )
+    )
+else:
+    filepaths = list(
+        map(
+            lambda x: os.path.join(project_dir, f"{x['file_type']}.parquet"),
+            module_params,
+        )
+    )
list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)))
filenames = list(map(lambda x: os.path.basename(x), filepaths))

@@ -35,4 +108,13 @@ def run_parser(
}
)
summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False)

# Concatenate the per-file-type parquet files into a single result when all_files is False.
if not all_files:
dataframes = [pd.read_parquet(file) for file in filepaths]
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_parquet(
os.path.join(project_dir, "parsed_result.parquet"), index=False
)

return summary_df
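In isolation, the fallback for unconfigured file types reduces to this sketch. `modules_to_add` is a hypothetical name, and `default_map` is trimmed to two entries for brevity, so unsupported extensions would raise a `KeyError` here:

```python
import os
from glob import glob
from typing import Dict, List

# Trimmed copy of the default_map shown in the diff above.
default_map = {
    "pdf": {"file_type": "pdf", "module_type": "langchain_parse", "parse_method": "pdfminer"},
    "csv": {"file_type": "csv", "module_type": "langchain_parse", "parse_method": "csv"},
}


def modules_to_add(data_path_glob: str, module_params: List[Dict]) -> List[Dict]:
    """Return default module configs for file types on disk but absent from the YAML."""
    file_types = {os.path.basename(p).split(".")[-1] for p in glob(data_path_glob)}
    set_file_types = {module["file_type"] for module in module_params}
    unknown = set_file_types - file_types
    if unknown:
        raise ValueError(f"File types {list(unknown)} are not in the data path.")
    missing = file_types - set_file_types
    if "json" in missing:
        # JSON needs a jq_schema, which only the YAML file can provide.
        raise ValueError("JSON file type must have a jq_schema set in the YAML file.")
    return [default_map[file_type] for file_type in missing]
```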
3 changes: 2 additions & 1 deletion autorag/parser.py
@@ -14,7 +14,7 @@ def __init__(self, data_path_glob: str, project_dir: Optional[str] = None):
self.data_path_glob = data_path_glob
self.project_dir = project_dir if project_dir is not None else os.getcwd()

-def start_parsing(self, yaml_path: str):
+def start_parsing(self, yaml_path: str, all_files: bool = False):
if not os.path.exists(self.project_dir):
os.makedirs(self.project_dir)

@@ -32,5 +32,6 @@ def start_parsing(self, yaml_path: str):
module_params=input_params,
data_path_glob=self.data_path_glob,
project_dir=self.project_dir,
all_files=all_files,
)
logger.info("Parsing Done!")
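Putting the pieces together, a usage sketch of the new flag (paths are placeholders):

```python
from autorag.parser import Parser

parser = Parser(data_path_glob="your/data/path/*", project_dir="your/project/dir")
# all_files=False (the default) routes each file type to its own parsing module
# and writes one parquet per file type plus a combined parsed_result.parquet;
# all_files=True keeps the previous behavior of one numbered parquet per module.
parser.start_parsing("your/path/to/parse_config.yaml", all_files=False)
```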
104 changes: 104 additions & 0 deletions docs/source/data_creation/legacy/parse.md
@@ -0,0 +1,104 @@
# Parse

In this section, we will cover how to parse raw documents.

Parsing the raw documents is a crucial step: if the raw documents are not parsed well, the RAG pipeline cannot be optimized well.

Using only a YAML file, you can easily use various document loaders.
The parsed result is saved in the data format used by AutoRAG.

## Overview

A sample parse pipeline looks like this:

```python
from autorag.parser import Parser

parser = Parser(data_path_glob="your/data/path/*")
parser.start_parsing("your/path/to/parse_config.yaml")
```

## Run Parse Pipeline

### 1. Set parser instance

```python
from autorag.parser import Parser

parser = Parser(data_path_glob="your/data/path/*")
```

#### 📌 Parameter: `data_path_glob`

The Parser instance requires the `data_path_glob` parameter.
This parameter specifies the path of the documents to be parsed.

Only glob patterns are supported.

You can use the wildcard character `*` to match multiple files.

You can also specify a file extension, such as `*.pdf`, to parse only specific file types.

```{admonition} Want to specify project folder?
You can specify the project directory with the `--project_dir` option or the `project_dir` parameter.
```
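
For example, a minimal sketch that sets `project_dir` via the parameter (paths are placeholders):

```python
from autorag.parser import Parser

# If project_dir is omitted, results are written to the current working directory.
parser = Parser(data_path_glob="your/data/path/*.pdf", project_dir="your/project/dir")
```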

### 2. Set YAML file

Here is an example of how to use the `langchain_parse` module.

```yaml
modules:
- module_type: langchain_parse
parse_method: [ pdfminer, pdfplumber ]
```
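
With the `file_type` option added in this commit, a single YAML file can mix document types at once; a sketch whose parse methods mirror the built-in defaults:

```yaml
modules:
  - module_type: langchain_parse
    file_type: pdf
    parse_method: pdfminer
  - module_type: langchain_parse
    file_type: csv
    parse_method: csv
  - module_type: langchain_parse
    file_type: md
    parse_method: unstructuredmarkdown
```
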
### 3. Start parsing

Use the `start_parsing` function to start parsing.

```python
parser.start_parsing("your/path/to/parse_config.yaml")
```

### 4. Check the result

If you set the `project_dir` parameter, you can check the result in that project directory.
If not, you can check the result in the current directory.

If the parsing is completed successfully, the following three types of files are created in the `project_dir`.

1. Parsed Result
2. Used YAML file
3. Summary file

For example, if parsing is performed using three parse methods, the following files are created:
`0.parquet`, `1.parquet`, `2.parquet`, `parse_config.yaml`, and `summary.csv`.

Finally, the `summary.csv` file shows information about the parsed result, such as which parse method was used to parse it.
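
To peek at a parsed result, a minimal sketch (the filename and path are illustrative):

```python
import pandas as pd

df = pd.read_parquet("your/project/dir/0.parquet")
print(df[["texts", "path", "page", "last_modified_datetime"]].head())
```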

## Output Columns

- `texts`: Parsed text from the document.
- `path`: Path of the document.
- `page`: Page number of the parsed text. Contains the page number when the module parses per page, otherwise -1.
  - Modules that parse per page: [ `clova`, `table_hybrid_parse` ]
  - Modules that do not parse per page: [ `langchain_parse`, `llama_parse` ]
- `last_modified_datetime`: Last modified datetime of the document.

#### Supported Parse Modules

📌 You can check all of our parse modules [here](https://edai.notion.site/Supporting-Parse-Modules-e0b7579c7c0e4fb2963e408eeccddd75?pvs=4).


```{toctree}
---
maxdepth: 1
---
langchain_parse.md
llama_parse.md
clova.md
table_hybrid_parse.md
```