From 4af83e9d10a82fb3ee48ca249197a9678a814ef4 Mon Sep 17 00:00:00 2001
From: Wang Baoling
Date: Fri, 21 Jun 2024 10:42:29 +0800
Subject: [PATCH] feat: support json file (#1217)

### What problem does this PR solve?

feat: support json file.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: KevinHuSh
---
 deepdoc/parser/__init__.py    |   1 +
 deepdoc/parser/json_parser.py | 116 ++++++++++++++++++++++++++++++++++
 rag/app/naive.py              |   8 ++-
 rag/nlp/__init__.py           |   2 +
 4 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 deepdoc/parser/json_parser.py

diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py
index dedd2c6ac2..d925e68f53 100644
--- a/deepdoc/parser/__init__.py
+++ b/deepdoc/parser/__init__.py
@@ -16,3 +16,4 @@
 from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
+from .json_parser import RAGFlowJsonParser as JsonParser
\ No newline at end of file
diff --git a/deepdoc/parser/json_parser.py b/deepdoc/parser/json_parser.py
new file mode 100644
index 0000000000..52b54d7b08
--- /dev/null
+++ b/deepdoc/parser/json_parser.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+# The following documents are mainly referenced, and only adaptation modifications have been made
+# from https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
+
+import json
+from typing import Any, Dict, List, Optional
+from rag.nlp import find_codec
+
+class RAGFlowJsonParser:
+    def __init__(
+        self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
+    ):
+        super().__init__()
+        self.max_chunk_size = max_chunk_size * 2
+        self.min_chunk_size = (
+            min_chunk_size
+            if min_chunk_size is not None
+            else max(max_chunk_size - 200, 50)
+        )
+
+    def __call__(self, binary):
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding, errors="ignore")
+        json_data = json.loads(txt)
+        chunks = self.split_json(json_data, True)
+        sections = [json.dumps(l, ensure_ascii=False) for l in chunks if l]
+        return sections
+
+    @staticmethod
+    def _json_size(data: Dict) -> int:
+        """Calculate the size of the serialized JSON object."""
+        return len(json.dumps(data, ensure_ascii=False))
+
+    @staticmethod
+    def _set_nested_dict(d: Dict, path: List[str], value: Any) -> None:
+        """Set a value in a nested dictionary based on the given path."""
+        for key in path[:-1]:
+            d = d.setdefault(key, {})
+        d[path[-1]] = value
+
+    def _list_to_dict_preprocessing(self, data: Any) -> Any:
+        if isinstance(data, dict):
+            # Process each key-value pair in the dictionary
+            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
+        elif isinstance(data, list):
+            # Convert the list to a dictionary with index-based keys
+            return {
+                str(i): self._list_to_dict_preprocessing(item)
+                for i, item in enumerate(data)
+            }
+        else:
+            # Base case: the item is neither a dict nor a list, so return it unchanged
+            return data
+
+    def _json_split(
+        self,
+        data: Dict[str, Any],
+        current_path: Optional[List[str]] = None,
+        chunks: Optional[List[Dict]] = None,
+    ) -> List[Dict]:
+        """
+        Split json into maximum size dictionaries while preserving structure.
+        """
+        current_path = current_path or []
+        chunks = chunks or [{}]
+        if isinstance(data, dict):
+            for key, value in data.items():
+                new_path = current_path + [key]
+                chunk_size = self._json_size(chunks[-1])
+                size = self._json_size({key: value})
+                remaining = self.max_chunk_size - chunk_size
+
+                if size < remaining:
+                    # Add item to current chunk
+                    self._set_nested_dict(chunks[-1], new_path, value)
+                else:
+                    if chunk_size >= self.min_chunk_size:
+                        # Chunk is big enough, start a new chunk
+                        chunks.append({})
+
+                    # Iterate
+                    self._json_split(value, new_path, chunks)
+        else:
+            # handle single item
+            self._set_nested_dict(chunks[-1], current_path, data)
+        return chunks
+
+    def split_json(
+        self,
+        json_data: Dict[str, Any],
+        convert_lists: bool = False,
+    ) -> List[Dict]:
+        """Splits JSON into a list of JSON chunks"""
+
+        if convert_lists:
+            chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
+        else:
+            chunks = self._json_split(json_data)
+
+        # Remove the last chunk if it's empty
+        if not chunks[-1]:
+            chunks.pop()
+        return chunks
+
+    def split_text(
+        self,
+        json_data: Dict[str, Any],
+        convert_lists: bool = False,
+        ensure_ascii: bool = True,
+    ) -> List[str]:
+        """Splits JSON into a list of JSON formatted strings"""
+
+        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
+
+        # Convert to string
+        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 0f4bd2434c..d0bdf6dd41 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -17,7 +17,7 @@
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
@@ -167,6 +167,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")
 
+    elif re.search(r"\.json$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = JsonParser(parser_config.get("chunk_token_num", 128))(binary)
+        sections = [(l, "") for l in sections if l]
+        callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         binary = BytesIO(binary)
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 2f404c4046..c65808eea4 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -471,7 +471,9 @@ def add_chunk(t, pos):
         tnum = num_tokens_from_string(t)
         if tnum < 8:
             pos = ""
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
                 t += pos
             cks.append(t)
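
For reviewers who want to exercise the new parser by hand, here is a minimal usage sketch. It is not part of the patch itself; it assumes it is run from the repository root so the `deepdoc.parser` and `rag.nlp` imports added above resolve, and the sample document is made up for illustration. The constructor argument mirrors the `chunk_token_num` default that `rag/app/naive.py` passes in.

```python
import json

from deepdoc.parser import JsonParser  # exported alias for RAGFlowJsonParser

# Roughly what rag/app/naive.py does for *.json uploads: the argument mirrors
# parser_config.get("chunk_token_num", 128). Internally the parser doubles it
# and compares it against the character length of the serialized JSON.
parser = JsonParser(128)

# Hypothetical sample document, only for illustration.
doc = {
    "title": "RAGFlow",
    "features": ["pdf", "docx", "excel", "json"],
    "meta": {"version": "0.8", "license": "Apache-2.0"},
}
binary = json.dumps(doc).encode("utf-8")

# __call__ detects the encoding, parses the JSON, splits it (lists are first
# converted to index-keyed dicts), and returns each chunk as a JSON string.
for section in parser(binary):
    print(section)
```

When the serialized input grows past roughly twice `chunk_token_num` characters, the parser returns several chunks, each a nested dict that preserves the key paths of the values it carries; `rag/app/naive.py` then wraps each returned string as a `(text, "")` section, the same as it does for the other text formats.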