Skip to content

Commit

Permalink
rename DocNode::text to DocNode::_content for image/video/... support
Browse files Browse the repository at this point in the history
  • Loading branch information
ouonline committed Nov 25, 2024
1 parent 425b0e5 commit 52b4e36
Show file tree
Hide file tree
Showing 27 changed files with 104 additions and 95 deletions.
4 changes: 3 additions & 1 deletion lazyllm/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .globals import globals, LazyLlmResponse, LazyLlmRequest, encode_request, decode_request
from .bind import root, Bind as bind, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9
from .queue import FileSystemQueue
from .utils import compile_func
from .utils import compile_func, obj2str, str2obj

__all__ = [
# registry
Expand All @@ -33,6 +33,8 @@
'DynamicDescriptor',
'singleton',
'reset_on_pickle',
'obj2str',
'str2obj',

# arg parser
'LazyLLMCMD',
Expand Down
7 changes: 3 additions & 4 deletions lazyllm/common/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@
import contextvars
import copy
from typing import Any, Tuple, Optional, List, Dict
import pickle
from pydantic import BaseModel as struct
from .common import package, kwargs
from .deprecated import deprecated
import asyncio
import base64
from .utils import obj2str, str2obj


class ReadWriteLock(object):
Expand Down Expand Up @@ -226,9 +225,9 @@ def __str__(self): return str(self.messages)


def encode_request(input):
return base64.b64encode(pickle.dumps(input)).decode('utf-8')
return obj2str(input)


def decode_request(input, default=None):
if input is None: return default
return pickle.loads(base64.b64decode(input.encode('utf-8')))
return str2obj(input)
8 changes: 8 additions & 0 deletions lazyllm/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from typing import Union, Dict, Callable, Any, Optional
import re
import ast
import pickle
import base64

def check_path(
path: Union[str, PathLike],
Expand Down Expand Up @@ -34,3 +36,9 @@ def compile_func(func_code: str, global_env: Optional[Dict[str, Any]] = None) ->
local_dict = {}
exec(func, global_env, local_dict)
return local_dict[fname]

def obj2str(obj: Any) -> str:
    """Serialize *obj* into a base64-encoded pickle string (inverse of ``str2obj``)."""
    pickled = pickle.dumps(obj)
    return base64.b64encode(pickled).decode('utf-8')

def str2obj(data: Optional[str]) -> Any:
    """Deserialize a base64-encoded pickle string produced by ``obj2str``.

    ``None`` is passed through unchanged so callers can forward absent
    values without a separate check.

    NOTE(review): ``pickle.loads`` executes arbitrary code during
    deserialization — only call this on trusted, internally produced
    strings, never on external input.
    """
    if data is None:
        return None
    return pickle.loads(base64.b64decode(data.encode('utf-8')))
22 changes: 9 additions & 13 deletions lazyllm/tools/rag/chroma_store.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
from typing import Any, Dict, List, Optional, Callable, Set
from lazyllm.thirdparty import chromadb
from lazyllm import LOG
from lazyllm.common import override
from lazyllm.common import override, obj2str, str2obj
from .store_base import StoreBase, LAZY_ROOT_NAME
from .doc_node import DocNode
from .index_base import IndexBase
from .utils import _FileNodeIndex
from .default_index import DefaultIndex
from .map_store import MapStore
import pickle
import base64

# ---------------------------------------------------------------------------- #

Expand Down Expand Up @@ -111,7 +109,7 @@ def _save_nodes(self, nodes: List[DocNode]) -> None:
ids.append(node.uid)
embeddings.append([0]) # we don't use chroma for retrieving
metadatas.append(metadata)
documents.append(node.get_text())
documents.append(obj2str(node.content))
if ids:
collection.upsert(
embeddings=embeddings,
Expand All @@ -132,16 +130,14 @@ def _build_nodes_from_chroma(self, results: Dict[str, List], embed_dims: Dict[st
chroma_metadata = results['metadatas'][i]

parent = chroma_metadata['parent']
local_metadata = pickle.loads(base64.b64decode(chroma_metadata['metadata'].encode('utf-8')))

global_metadata = pickle.loads(base64.b64decode(chroma_metadata['global_metadata'].encode('utf-8')))\
if not parent else None
local_metadata = str2obj(chroma_metadata['metadata'])
global_metadata = str2obj(chroma_metadata['global_metadata']) if not parent else None

node = DocNode(
uid=uid,
text=results["documents"][i],
content=str2obj(results["documents"][i]),
group=chroma_metadata["group"],
embedding=pickle.loads(base64.b64decode(chroma_metadata['embedding'].encode('utf-8'))),
embedding=str2obj(chroma_metadata['embedding']),
parent=parent,
metadata=local_metadata,
global_metadata=global_metadata,
Expand Down Expand Up @@ -170,12 +166,12 @@ def _make_chroma_metadata(self, node: DocNode) -> Dict[str, Any]:
metadata = {
"group": node.group,
"parent": node.parent.uid if node.parent else "",
"embedding": base64.b64encode(pickle.dumps(node.embedding)).decode('utf-8'),
"metadata": base64.b64encode(pickle.dumps(node.metadata)).decode('utf-8'),
"embedding": obj2str(node.embedding),
"metadata": obj2str(node.metadata),
}

if not node.parent:
metadata["global_metadata"] = base64.b64encode(pickle.dumps(node.global_metadata)).decode('utf-8')
metadata["global_metadata"] = obj2str(node.global_metadata)

return metadata

Expand Down
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/dataReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def load_file(input_file: Path, file_metadata: Callable[[str], Dict], file_extra
with fs.open(input_file, encoding=encoding) as f:
data = f.read().decode(encoding)

doc = DocNode(text=data, metadata=metadata or {})
doc = DocNode(content=data, metadata=metadata or {})
doc.docpath = str(input_file)
if filename_as_id: doc.uid = str(input_file)
documents.append(doc)
Expand Down
4 changes: 2 additions & 2 deletions lazyllm/tools/rag/doc_manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import hashlib
import json
from typing import List, Optional, Dict
from pydantic import BaseModel, Field
Expand All @@ -10,6 +9,7 @@
import lazyllm
from lazyllm import FastapiApp as app
from .utils import DocListManager, BaseResponse
from .doc_impl import gen_docid
from .global_metadata import RAG_DOC_ID, RAG_DOC_PATH


Expand Down Expand Up @@ -74,7 +74,7 @@ def upload_files(self, files: List[UploadFile], override: bool = False, # noqa
lazyllm.LOG.error(f'writing file [{path}] to disk failed: [{e}]')
raise e

file_id = hashlib.sha256(path.encode()).hexdigest()
file_id = gen_docid(path)
self._manager.update_file_status([file_id], status=DocListManager.Status.success)
results.append('Success')

Expand Down
17 changes: 12 additions & 5 deletions lazyllm/tools/rag/doc_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ class MetadataMode(str, Enum):

@reset_on_pickle(('_lock', threading.Lock))
class DocNode:
def __init__(self, uid: Optional[str] = None, text: Optional[str] = None, group: Optional[str] = None,
embedding: Optional[Dict[str, List[float]]] = None, parent: Optional["DocNode"] = None,
metadata: Optional[Dict[str, Any]] = None, global_metadata: Optional[Dict[str, Any]] = None):
def __init__(self, uid: Optional[str] = None, content: Optional[Union[str, List[Any]]] = None,
group: Optional[str] = None, embedding: Optional[Dict[str, List[float]]] = None,
parent: Optional["DocNode"] = None, metadata: Optional[Dict[str, Any]] = None,
global_metadata: Optional[Dict[str, Any]] = None):
self.uid: str = uid if uid else str(uuid.uuid4())
self.text: Optional[str] = text
self.content: Optional[Union[str, List[Any]]] = content
self.group: Optional[str] = group
self.embedding: Optional[Dict[str, List[float]]] = embedding or {}
self._metadata: Dict[str, Any] = metadata or {}
Expand All @@ -37,6 +38,12 @@ def __init__(self, uid: Optional[str] = None, text: Optional[str] = None, group:
raise ValueError('only ROOT node can set global metadata.')
self._global_metadata = global_metadata if global_metadata else {}

@property
def text(self) -> str:
    """Return the node's content, which must be a plain string.

    Raises ``TypeError`` for non-string content (e.g. image/video payloads).
    """
    if isinstance(self.content, str):
        return self.content
    raise TypeError(f"node content type '{type(self.content)}' is not a string")

@property
def root_node(self) -> Optional["DocNode"]:
root = self.parent
Expand Down Expand Up @@ -97,7 +104,7 @@ def get_parent_id(self) -> str:

def __str__(self) -> str:
return (
f"DocNode(id: {self.uid}, group: {self.group}, text: {self.get_text()}) parent: {self.get_parent_id()}, "
f"DocNode(id: {self.uid}, group: {self.group}, content: {self.content}) parent: {self.get_parent_id()}, "
f"children: {self.get_children_str()}"
)

Expand Down
14 changes: 6 additions & 8 deletions lazyllm/tools/rag/milvus_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
from .index_base import IndexBase
from .store_base import StoreBase
from .global_metadata import GlobalMetadataDesc, RAG_DOC_PATH, RAG_DOC_ID
from lazyllm.common import override
import pickle
import base64
from lazyllm.common import override, obj2str, str2obj

class MilvusStore(StoreBase):
# we define these variables as members so that pymilvus is not imported until MilvusStore is instantiated.
Expand All @@ -29,7 +27,7 @@ def _def_constants(self) -> None:
'dtype': pymilvus.DataType.VARCHAR,
'max_length': 256,
},
'text': {
'content': {
'dtype': pymilvus.DataType.VARCHAR,
'max_length': 65535,
},
Expand Down Expand Up @@ -253,9 +251,9 @@ def _construct_filter_expr(self, filters: Dict[str, Union[str, int, List, Set]])
def _serialize_node_partial(self, node: DocNode) -> Dict:
res = {
'uid': node.uid,
'text': node.text,
'content': obj2str(node.content),
'parent': node.parent.uid if node.parent else '',
'metadata': base64.b64encode(pickle.dumps(node._metadata)).decode('utf-8'),
'metadata': obj2str(node._metadata),
}

for k, v in node.embedding.items():
Expand All @@ -273,9 +271,9 @@ def _deserialize_node_partial(self, result: Dict) -> DocNode:

doc = DocNode(
uid=record.pop('uid'),
text=record.pop('text'),
content=str2obj(record.pop('content')),
parent=record.pop('parent'), # this is the parent's uid
metadata=pickle.loads(base64.b64decode(record.pop('metadata').encode('utf-8'))),
metadata=str2obj(record.pop('metadata')),
)

for k, v in record.items():
Expand Down
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/readers/docxReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
metadata = {"file_name": file.name}
if extra_info is not None: metadata.update(extra_info)

return [DocNode(text=text, metadata=metadata)]
return [DocNode(content=text, metadata=metadata)]
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/readers/epubReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
if item.get_type() == ebooklib.ITEM_DOCUMENT:
text_list.append(html2text.html2text(item.get_content().decode("utf-8")))
text = "\n".join(text_list)
return [DocNode(text=text, metadata=extra_info or {})]
return [DocNode(content=text, metadata=extra_info or {})]
4 changes: 2 additions & 2 deletions lazyllm/tools/rag/readers/hwpReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
if self._is_valid(file_dir) is False: raise Exception("Not Valid HwpFile")

result_text = self._get_text(load_file, file_dir)
return [DocNode(text=result_text, metadata=extra_info or {})]
return [DocNode(content=result_text, metadata=extra_info or {})]

def _is_valid(self, dirs: List[str]) -> bool:
if [self._FILE_HEADER_SECTION] not in dirs: return False
return [self._HWP_SUMMARY_SECTION] in dirs

def _text_to_docnode(self, text: str, extra_info: Optional[Dict] = None) -> DocNode:
return DocNode(text=text, metadata=extra_info or {})
return DocNode(content=text, metadata=extra_info or {})

def _get_text(self, load_file: Any, file_dirs: List[str]) -> str:
sections = self._get_body_sections(file_dirs)
Expand Down
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/readers/imageReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
model = cast(pytesseract, self._parser_config['model'])
text_str = model.image_to_string(image, **self._pytesseract_model_kwargs)

return [DocNode(text=text_str, metadata=extra_info or {})]
return [DocNode(content=text_str, metadata=extra_info or {})]
4 changes: 2 additions & 2 deletions lazyllm/tools/rag/readers/ipynbReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
splits = re.split(r"In\[\d+\]:", doc_str)
splits.pop(0)

if self._concatenate: docs = [DocNode(text="\n\n".join(splits), metadata=extra_info or {})]
else: docs = [DocNode(text=s, metadata=extra_info or {}) for s in splits]
if self._concatenate: docs = [DocNode(content="\n\n".join(splits), metadata=extra_info or {})]
else: docs = [DocNode(content=s, metadata=extra_info or {}) for s in splits]

return docs
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/readers/markdownReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
if not isinstance(file, Path): file = Path(file)

tups = self._parse_tups(file, fs=fs)
results = [DocNode(text=value if header is None else f"\n\n{header}\n{value}", metadata=extra_info or {})
results = [DocNode(content=value if header is None else f"\n\n{header}\n{value}", metadata=extra_info or {})
for header, value in tups]

return results
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/readers/mboxreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,

i += 1
if self._max_count > 0 and i >= self._max_count: break
return [DocNode(text=result, metadata=extra_info or {}) for result in results]
return [DocNode(content=result, metadata=extra_info or {}) for result in results]
12 changes: 6 additions & 6 deletions lazyllm/tools/rag/readers/pandasReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,

text_list = df.apply(lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1).tolist()

if self._concat_rows: return [DocNode(text=(self._row_joiner).join(text_list), metadata=extra_info or {})]
else: return [DocNode(text=text, metadata=extra_info or {}) for text in text_list]
if self._concat_rows: return [DocNode(content=(self._row_joiner).join(text_list), metadata=extra_info or {})]
else: return [DocNode(content=text, metadata=extra_info or {}) for text in text_list]

class PandasExcelReader(LazyLLMReaderBase):
def __init__(self, concat_rows: bool = True, sheet_name: Optional[str] = None,
Expand Down Expand Up @@ -58,14 +58,14 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
df = dfs.fillna("")
text_list = (df.astype(str).apply(lambda row: " ".join(row.values), axis=1).tolist())

if self._concat_rows: documents.append(DocNode(text="\n".join(text_list), metadata=extra_info or {}))
else: documents.extend([DocNode(text=text, metadata=extra_info or {}) for text in text_list])
if self._concat_rows: documents.append(DocNode(content="\n".join(text_list), metadata=extra_info or {}))
else: documents.extend([DocNode(content=text, metadata=extra_info or {}) for text in text_list])
else:
for df in dfs.values():
df = df.fillna("")
text_list = (df.astype(str).apply(lambda row: " ".join(row), axis=1).tolist())

if self._concat_rows: documents.append(DocNode(text="\n".join(text_list), metadata=extra_info or {}))
else: documents.extend([DocNode(text=text, metadata=extra_info or {}) for text in text_list])
if self._concat_rows: documents.append(DocNode(content="\n".join(text_list), metadata=extra_info or {}))
else: documents.extend([DocNode(content=text, metadata=extra_info or {}) for text in text_list])

return documents
4 changes: 2 additions & 2 deletions lazyllm/tools/rag/readers/pdfReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
metadata = {"file_name": file.name}
if extra_info is not None: metadata.update(extra_info)
text = "\n".join(pdf.pages[page].extract_text() for page in range(num_pages))
docs.append(DocNode(text=text, metadata=metadata))
docs.append(DocNode(content=text, metadata=metadata))
else:
for page in range(num_pages):
page_text = pdf.pages[page].extract_text()
page_label = pdf.page_labels[page]
metadata = {"page_label": page_label, "file_name": file.name}
if extra_info is not None: metadata.update(extra_info)
docs.append(DocNode(text=page_text, metadata=metadata))
docs.append(DocNode(content=page_text, metadata=metadata))
return docs
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/readers/pptxReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
os.unlink(f.name)

if hasattr(shape, "text"): result += f"{shape.text}\n"
return [DocNode(text=result, metadata=extra_info or {})]
return [DocNode(content=result, metadata=extra_info or {})]
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/readers/videoAudioReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
result = model.transcribe(str(file))

transcript = result['text']
return [DocNode(text=transcript, metadata=extra_info or {})]
return [DocNode(content=transcript, metadata=extra_info or {})]
4 changes: 2 additions & 2 deletions lazyllm/tools/rag/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def build_nodes_from_splits(
if not text_chunk:
continue
node = DocNode(
text=text_chunk,
content=text_chunk,
group=node_group,
parent=doc,
)
Expand Down Expand Up @@ -104,7 +104,7 @@ def __call__(self, node: DocNode, **kwargs: Any) -> List[DocNode]:
# Parent and child should not be set here.
results = self.transform(node, **kwargs)
if isinstance(results, (DocNode, str)): results = [results]
return [DocNode(text=chunk) if isinstance(chunk, str) else chunk for chunk in results if chunk]
return [DocNode(content=chunk) if isinstance(chunk, str) else chunk for chunk in results if chunk]


def make_transform(t):
Expand Down
8 changes: 4 additions & 4 deletions tests/advanced_tests/standard_test/test_reranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
class TestReranker(unittest.TestCase):

def setUp(self):
self.doc1 = DocNode(text="This is a test document with the keyword apple.")
self.doc1 = DocNode(content="This is a test document with the keyword apple.")
self.doc2 = DocNode(
text="This is another test document with the keyword banana."
content="This is another test document with the keyword banana."
)
self.doc3 = DocNode(text="This document contains the keyword cherry.")
self.doc3 = DocNode(content="This document contains the keyword cherry.")
self.nodes = [self.doc1, self.doc2, self.doc3]
self.query = "test query"

Expand Down Expand Up @@ -63,7 +63,7 @@ def CustomReranker(node, **kwargs):
return node
return None

custom_doc = DocNode(text="This document contains custom keyword.")
custom_doc = DocNode(content="This document contains custom keyword.")
nodes = [self.doc1, self.doc2, self.doc3, custom_doc]

reranker = Reranker(name="CustomReranker")
Expand Down
Loading

0 comments on commit 52b4e36

Please sign in to comment.