review2
ouonline committed Dec 2, 2024
1 parent 6acbcf2 commit 3c67325
Showing 21 changed files with 76 additions and 63 deletions.
lazyllm/tools/rag/dataReader.py (1 addition, 1 deletion)

@@ -220,7 +220,7 @@ def load_file(input_file: Path, file_metadata: Callable[[str], Dict], file_extra
        with fs.open(input_file, encoding=encoding) as f:
            data = f.read().decode(encoding)

-       doc = DocNode(content=data, metadata=metadata or {})
+       doc = DocNode(text=data, metadata=metadata or {})
        doc.docpath = str(input_file)
        if filename_as_id: doc.uid = str(input_file)
        documents.append(doc)

lazyllm/tools/rag/doc_node.py (5 additions, 2 deletions)

@@ -19,9 +19,12 @@ class DocNode:
    def __init__(self, uid: Optional[str] = None, content: Optional[Union[str, List[Any]]] = None,
                 group: Optional[str] = None, embedding: Optional[Dict[str, List[float]]] = None,
                 parent: Optional["DocNode"] = None, metadata: Optional[Dict[str, Any]] = None,
-                global_metadata: Optional[Dict[str, Any]] = None):
+                global_metadata: Optional[Dict[str, Any]] = None, *, text: Optional[str] = None):
+       if text and content:
+           raise ValueError(f'`text` and `content` cannot be set at the same time.')
+
        self.uid: str = uid if uid else str(uuid.uuid4())
-       self.content: Optional[Union[str, List[Any]]] = content
+       self.content: Optional[Union[str, List[Any]]] = content if content else text
        self.group: Optional[str] = group
        self.embedding: Optional[Dict[str, List[float]]] = embedding or {}
        self._metadata: Dict[str, Any] = metadata or {}

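A minimal sketch of how the updated constructor behaves for callers (illustrative only, not part of the commit; the import path is assumed from the file location shown above):

    from lazyllm.tools.rag.doc_node import DocNode  # assumed import path

    # `text` is a new keyword-only way to pass plain string content; it is stored in `.content`.
    node = DocNode(text="hello world")
    assert node.content == "hello world"

    # Passing both `content` and `text` (both truthy) is rejected by the `if text and content` check.
    try:
        DocNode(content="hello", text="world")
    except ValueError as err:
        print(err)  # `text` and `content` cannot be set at the same time.
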
lazyllm/tools/rag/readers/docxReader.py (1 addition, 1 deletion)

@@ -22,4 +22,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
        metadata = {"file_name": file.name}
        if extra_info is not None: metadata.update(extra_info)

-       return [DocNode(content=text, metadata=metadata)]
+       return [DocNode(text=text, metadata=metadata)]

lazyllm/tools/rag/readers/epubReader.py (1 addition, 1 deletion)

@@ -30,4 +30,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                text_list.append(html2text.html2text(item.get_content().decode("utf-8")))
        text = "\n".join(text_list)
-       return [DocNode(content=text, metadata=extra_info or {})]
+       return [DocNode(text=text, metadata=extra_info or {})]

lazyllm/tools/rag/readers/hwpReader.py (2 additions, 2 deletions)

@@ -36,14 +36,14 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
        if self._is_valid(file_dir) is False: raise Exception("Not Valid HwpFile")

        result_text = self._get_text(load_file, file_dir)
-       return [DocNode(content=result_text, metadata=extra_info or {})]
+       return [DocNode(text=result_text, metadata=extra_info or {})]

    def _is_valid(self, dirs: List[str]) -> bool:
        if [self._FILE_HEADER_SECTION] not in dirs: return False
        return [self._HWP_SUMMARY_SECTION] in dirs

    def _text_to_docnode(self, text: str, extra_info: Optional[Dict] = None) -> DocNode:
-       return DocNode(content=text, metadata=extra_info or {})
+       return DocNode(text=text, metadata=extra_info or {})

    def _get_text(self, load_file: Any, file_dirs: List[str]) -> str:
        sections = self._get_body_sections(file_dirs)

lazyllm/tools/rag/readers/imageReader.py (1 addition, 1 deletion)

@@ -99,4 +99,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
        model = cast(pytesseract, self._parser_config['model'])
        text_str = model.image_to_string(image, **self._pytesseract_model_kwargs)

-       return [DocNode(content=text_str, metadata=extra_info or {})]
+       return [DocNode(text=text_str, metadata=extra_info or {})]

lazyllm/tools/rag/readers/ipynbReader.py (2 additions, 2 deletions)

@@ -31,7 +31,7 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
        splits = re.split(r"In\[\d+\]:", doc_str)
        splits.pop(0)

-       if self._concatenate: docs = [DocNode(content="\n\n".join(splits), metadata=extra_info or {})]
-       else: docs = [DocNode(content=s, metadata=extra_info or {}) for s in splits]
+       if self._concatenate: docs = [DocNode(text="\n\n".join(splits), metadata=extra_info or {})]
+       else: docs = [DocNode(text=s, metadata=extra_info or {}) for s in splits]

        return docs

lazyllm/tools/rag/readers/markdownReader.py (1 addition, 1 deletion)

@@ -61,7 +61,7 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
        if not isinstance(file, Path): file = Path(file)

        tups = self._parse_tups(file, fs=fs)
-       results = [DocNode(content=value if header is None else f"\n\n{header}\n{value}", metadata=extra_info or {})
+       results = [DocNode(text=value if header is None else f"\n\n{header}\n{value}", metadata=extra_info or {})
                   for header, value in tups]

        return results

lazyllm/tools/rag/readers/mboxreader.py (1 addition, 1 deletion)

@@ -65,4 +65,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,

            i += 1
            if self._max_count > 0 and i >= self._max_count: break
-       return [DocNode(content=result, metadata=extra_info or {}) for result in results]
+       return [DocNode(text=result, metadata=extra_info or {}) for result in results]

lazyllm/tools/rag/readers/pandasReader.py (6 additions, 6 deletions)

@@ -28,8 +28,8 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,

        text_list = df.apply(lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1).tolist()

-       if self._concat_rows: return [DocNode(content=(self._row_joiner).join(text_list), metadata=extra_info or {})]
-       else: return [DocNode(content=text, metadata=extra_info or {}) for text in text_list]
+       if self._concat_rows: return [DocNode(text=(self._row_joiner).join(text_list), metadata=extra_info or {})]
+       else: return [DocNode(text=text, metadata=extra_info or {}) for text in text_list]

class PandasExcelReader(LazyLLMReaderBase):
    def __init__(self, concat_rows: bool = True, sheet_name: Optional[str] = None,

@@ -58,14 +58,14 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
            df = dfs.fillna("")
            text_list = (df.astype(str).apply(lambda row: " ".join(row.values), axis=1).tolist())

-           if self._concat_rows: documents.append(DocNode(content="\n".join(text_list), metadata=extra_info or {}))
-           else: documents.extend([DocNode(content=text, metadata=extra_info or {}) for text in text_list])
+           if self._concat_rows: documents.append(DocNode(text="\n".join(text_list), metadata=extra_info or {}))
+           else: documents.extend([DocNode(text=text, metadata=extra_info or {}) for text in text_list])
        else:
            for df in dfs.values():
                df = df.fillna("")
                text_list = (df.astype(str).apply(lambda row: " ".join(row), axis=1).tolist())

-               if self._concat_rows: documents.append(DocNode(content="\n".join(text_list), metadata=extra_info or {}))
-               else: documents.extend([DocNode(content=text, metadata=extra_info or {}) for text in text_list])
+               if self._concat_rows: documents.append(DocNode(text="\n".join(text_list), metadata=extra_info or {}))
+               else: documents.extend([DocNode(text=text, metadata=extra_info or {}) for text in text_list])

        return documents

lazyllm/tools/rag/readers/pdfReader.py (2 additions, 2 deletions)

@@ -34,12 +34,12 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
            metadata = {"file_name": file.name}
            if extra_info is not None: metadata.update(extra_info)
            text = "\n".join(pdf.pages[page].extract_text() for page in range(num_pages))
-           docs.append(DocNode(content=text, metadata=metadata))
+           docs.append(DocNode(text=text, metadata=metadata))
        else:
            for page in range(num_pages):
                page_text = pdf.pages[page].extract_text()
                page_label = pdf.page_labels[page]
                metadata = {"page_label": page_label, "file_name": file.name}
                if extra_info is not None: metadata.update(extra_info)
-               docs.append(DocNode(content=page_text, metadata=metadata))
+               docs.append(DocNode(text=page_text, metadata=metadata))
        return docs

lazyllm/tools/rag/readers/pptxReader.py (1 addition, 1 deletion)

@@ -78,4 +78,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
                os.unlink(f.name)

            if hasattr(shape, "text"): result += f"{shape.text}\n"
-       return [DocNode(content=result, metadata=extra_info or {})]
+       return [DocNode(text=result, metadata=extra_info or {})]

lazyllm/tools/rag/readers/videoAudioReader.py (1 addition, 1 deletion)

@@ -45,4 +45,4 @@ def _load_data(self, file: Path, extra_info: Optional[Dict] = None,
        result = model.transcribe(str(file))

        transcript = result['text']
-       return [DocNode(content=transcript, metadata=extra_info or {})]
+       return [DocNode(text=transcript, metadata=extra_info or {})]

lazyllm/tools/rag/transform.py (2 additions, 2 deletions)

@@ -46,7 +46,7 @@ def build_nodes_from_splits(
        if not text_chunk:
            continue
        node = DocNode(
-           content=text_chunk,
+           text=text_chunk,
            group=node_group,
            parent=doc,
        )

@@ -104,7 +104,7 @@ def __call__(self, node: DocNode, **kwargs: Any) -> List[DocNode]:
        # Parent and child should not be set here.
        results = self.transform(node, **kwargs)
        if isinstance(results, (DocNode, str)): results = [results]
-       return [DocNode(content=chunk) if isinstance(chunk, str) else chunk for chunk in results if chunk]
+       return [DocNode(text=chunk) if isinstance(chunk, str) else chunk for chunk in results if chunk]


def make_transform(t):

tests/advanced_tests/standard_test/test_reranker.py (4 additions, 4 deletions)

@@ -8,11 +8,11 @@
class TestReranker(unittest.TestCase):

    def setUp(self):
-       self.doc1 = DocNode(content="This is a test document with the keyword apple.")
+       self.doc1 = DocNode(text="This is a test document with the keyword apple.")
        self.doc2 = DocNode(
-           content="This is another test document with the keyword banana."
+           text="This is another test document with the keyword banana."
        )
-       self.doc3 = DocNode(content="This document contains the keyword cherry.")
+       self.doc3 = DocNode(text="This document contains the keyword cherry.")
        self.nodes = [self.doc1, self.doc2, self.doc3]
        self.query = "test query"

@@ -63,7 +63,7 @@ def CustomReranker(node, **kwargs):
                return node
            return None

-       custom_doc = DocNode(content="This document contains custom keyword.")
+       custom_doc = DocNode(text="This document contains custom keyword.")
        nodes = [self.doc1, self.doc2, self.doc3, custom_doc]

        reranker = Reranker(name="CustomReranker")

tests/basic_tests/test_bm25.py (8 additions, 8 deletions)

@@ -7,9 +7,9 @@
class TestBM25(unittest.TestCase):
    def setUp(self):
        self.nodes = [
-           DocNode(content="This is a test document."),
-           DocNode(content="This document is for testing BM25."),
-           DocNode(content="BM25 is a ranking function used in information retrieval."),
+           DocNode(text="This is a test document."),
+           DocNode(text="This document is for testing BM25."),
+           DocNode(text="BM25 is a ranking function used in information retrieval."),
        ]

        self.bm25_en = BM25(self.nodes, language="en", topk=2)

@@ -36,16 +36,16 @@ def test_retrieve(self):
class TestBM25Chinese(unittest.TestCase):
    def setUp(self):
        self.nodes = [
-           DocNode(content="这是一个测试文档。这个文档用于测试BM25。"),
+           DocNode(text="这是一个测试文档。这个文档用于测试BM25。"),
            DocNode(
-               content="BM25是一种在信息检索中使用的排序函数。信息检索系统通过BM25算法来排序文档和分数。"
+               text="BM25是一种在信息检索中使用的排序函数。信息检索系统通过BM25算法来排序文档和分数。"
            ),
-           DocNode(content="中文文档的测试内容。测试文档中包含多个句子。"),
+           DocNode(text="中文文档的测试内容。测试文档中包含多个句子。"),
            DocNode(
-               content="这个测试是为了验证BM25在中文文档中的表现。我们需要对多个文档进行排序测试。"
+               text="这个测试是为了验证BM25在中文文档中的表现。我们需要对多个文档进行排序测试。"
            ),
            DocNode(
-               content="文档的内容可以影响BM25的评分。排序函数的性能对于信息检索非常重要。"
+               text="文档的内容可以影响BM25的评分。排序函数的性能对于信息检索非常重要。"
            ),
        ]

tests/basic_tests/test_doc_node.py (2 additions, 2 deletions)

@@ -9,7 +9,7 @@ def setup_method(self):
        self.metadata = {"author": "John Doe", "date": "2023-07-01"}
        self.embedding = [0.1, 0.2, 0.3]
        self.node = DocNode(
-           content=self.text,
+           text=self.text,
            embedding={"default": self.embedding},
        )
        self.node.metadata = self.metadata

@@ -71,7 +71,7 @@ def test_get_metadata_str(self):

    def test_root_node(self):
        """Test the root_node property."""
-       child_node = DocNode(content="Child node", parent=self.node)
+       child_node = DocNode(text="Child node", parent=self.node)
        assert child_node.root_node == self.node

    def test_metadata_property(self):

tests/basic_tests/test_document.py (3 additions, 2 deletions)

@@ -25,7 +25,7 @@ class TestDocImpl(unittest.TestCase):
    def setUp(self):
        self.mock_embed = MagicMock()
        self.mock_directory_reader = MagicMock()
-       mock_node = DocNode(group=LAZY_ROOT_NAME, content="dummy text")
+       mock_node = DocNode(group=LAZY_ROOT_NAME, text="dummy text")
        mock_node._global_metadata = {RAG_DOC_PATH: "dummy_file.txt"}
        self.mock_directory_reader.load_data.return_value = [mock_node]

@@ -72,7 +72,7 @@ def test_add_files(self):
        assert self.doc_impl.store is None
        self.doc_impl._lazy_init()
        assert len(self.doc_impl.store.get_nodes(LAZY_ROOT_NAME)) == 1
-       new_doc = DocNode(content="new dummy text", group=LAZY_ROOT_NAME)
+       new_doc = DocNode(text="new dummy text", group=LAZY_ROOT_NAME)
        new_doc._global_metadata = {RAG_DOC_PATH: "new_file.txt"}
        self.mock_directory_reader.load_data.return_value = [new_doc]
        self.doc_impl._add_files(["new_file.txt"])

@@ -210,6 +210,7 @@ def test_delete_files_in_store(self):
        response = httpx.post(url, params=params, files=files, timeout=10)
        assert response.status_code == 200 and response.json().get('code') == 200, response.json()
        ids = response.json().get('data')[0]
+       lazyllm.LOG.error(f'debug!!! ids -> {ids}')
        assert len(ids) == 2

        time.sleep(20) # waiting for worker thread to update newly uploaded files

tests/basic_tests/test_rag_reader.py (2 additions, 2 deletions)

@@ -7,14 +7,14 @@ class YmlReader(ReaderBase):
    def _load_data(self, file, extra_info=None, fs=None):
        with open(file, 'r') as f:
            data = f.read()
-           node = DocNode(content=data, metadata=extra_info or {})
+           node = DocNode(text=data, metadata=extra_info or {})
            node.content = "Call the class YmlReader."
            return [node]

def processYml(file, extra_info=None):
    with open(file, 'r') as f:
        data = f.read()
-       node = DocNode(content=data, metadata=extra_info or {})
+       node = DocNode(text=data, metadata=extra_info or {})
        node.content = "Call the function processYml."
        return [node]
