Skip to content

Commit

Permalink
Merge branch 'main' into add-recursive-chunking
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsbatista committed Jan 9, 2025
2 parents df214d6 + 28ad78c commit 25721bb
Show file tree
Hide file tree
Showing 9 changed files with 329 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/pydoc/config/converters_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ loaders:
"pypdf",
"tika",
"txt",
"xlsx",
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
2 changes: 2 additions & 0 deletions haystack/components/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from haystack.components.converters.pypdf import PyPDFToDocument
from haystack.components.converters.tika import TikaDocumentConverter
from haystack.components.converters.txt import TextFileToDocument
from haystack.components.converters.xlsx import XLSXToDocument

__all__ = [
"TextFileToDocument",
Expand All @@ -31,4 +32,5 @@
"PPTXToDocument",
"CSVToDocument",
"JSONConverter",
"XLSXToDocument",
]
180 changes: 180 additions & 0 deletions haystack/components/converters/xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

import io
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import pandas as pd

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

logger = logging.getLogger(__name__)

with LazyImport("Run 'pip install openpyxl'") as xlsx_import:
import openpyxl # pylint: disable=unused-import # the library is used but not directly referenced

with LazyImport("Run 'pip install tabulate'") as tabulate_import:
from tabulate import tabulate # pylint: disable=unused-import # the library is used but not directly referenced


@component
class XLSXToDocument:
    """
    Converts XLSX (Excel) files into Documents.

    Supports reading data from specific sheets or all sheets in the Excel file. If all sheets are read, a Document is
    created for each sheet. The content of the Document is the table which can be saved in CSV or Markdown format.

    ### Usage example

    ```python
    from datetime import datetime

    from haystack.components.converters.xlsx import XLSXToDocument

    converter = XLSXToDocument()
    results = converter.run(sources=["sample.xlsx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # ",A,B\\n1,col_a,col_b\\n2,1.5,test\\n"
    ```
    """

    def __init__(
        self,
        table_format: Literal["csv", "markdown"] = "csv",
        sheet_name: Union[str, int, List[Union[str, int]], None] = None,
        read_excel_kwargs: Optional[Dict[str, Any]] = None,
        table_format_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates a XLSXToDocument component.

        :param table_format: The format to convert the Excel file to.
        :param sheet_name: The name of the sheet to read. If None, all sheets are read.
        :param read_excel_kwargs: Additional arguments to pass to `pandas.read_excel`.
            See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html#pandas-read-excel
            The `sheet_name`, `header` and `engine` keys are always set by this component, so values supplied
            for them here are ignored.
        :param table_format_kwargs: Additional keyword arguments to pass to the table format function.
            - If `table_format` is "csv", these arguments are passed to `pandas.DataFrame.to_csv`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
            - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
        :raises ValueError: If `table_format` is neither "csv" nor "markdown".
        """
        xlsx_import.check()
        # Validate before storing anything so a rejected value never ends up on the instance.
        if table_format not in ("csv", "markdown"):
            raise ValueError(f"Unsupported export format: {table_format}. Choose either 'csv' or 'markdown'.")
        if table_format == "markdown":
            # tabulate is only needed for the markdown export (DataFrame.to_markdown relies on it).
            tabulate_import.check()
        self.table_format = table_format
        self.sheet_name = sheet_name
        self.read_excel_kwargs = read_excel_kwargs or {}
        self.table_format_kwargs = table_format_kwargs or {}

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Converts XLSX files to Documents, creating one Document per sheet that is read.

        Sources that cannot be read or parsed are skipped with a warning instead of raising.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
        :returns:
            A dictionary with the following keys:
            - `documents`: Created documents
        """
        documents = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            try:
                tables, tables_metadata = self._extract_tables(bytestream)
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to a Document, skipping. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            # Loop over tables and create a Document for each table
            for table, excel_metadata in zip(tables, tables_metadata):
                # On key clashes the later dict wins: ByteStream meta < user meta < sheet metadata.
                merged_metadata = {**bytestream.meta, **metadata, **excel_metadata}
                documents.append(Document(content=table, meta=merged_metadata))

        return {"documents": documents}

    @staticmethod
    def _generate_excel_column_names(n_cols: int) -> List[str]:
        """
        Build the first `n_cols` Excel-style column labels: A, B, ..., Z, AA, AB, ...

        :param n_cols: Number of labels to generate.
        :returns: The labels in column order; an empty list when `n_cols` is 0.
        """
        result = []
        for i in range(n_cols):
            col_name = ""
            num = i
            # Bijective base-26: there is no "zero" letter, hence the extra -1 when carrying to the next position.
            while num >= 0:
                col_name = chr(num % 26 + 65) + col_name
                num = num // 26 - 1
            result.append(col_name)
        return result

    def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict]]:
        """
        Extract tables from an Excel file.

        :param bytestream: The ByteStream whose `data` holds the raw XLSX bytes.
        :returns: A tuple `(tables, metadata)` of equal length. `tables` holds one CSV or Markdown string per sheet
            read, and `metadata` holds the matching `{"xlsx": {"sheet_name": ...}}` dictionaries.
        """
        # The fixed keys come last so they override anything supplied through `read_excel_kwargs`.
        resolved_read_excel_kwargs = {
            **self.read_excel_kwargs,
            "sheet_name": self.sheet_name,
            "header": None,  # Don't assign any pandas column labels
            "engine": "openpyxl",  # Use openpyxl as the engine to read the Excel file
        }
        sheet_to_dataframe = pd.read_excel(io=io.BytesIO(bytestream.data), **resolved_read_excel_kwargs)
        # A single DataFrame is wrapped so that every case below is handled as a mapping of sheet -> DataFrame.
        if isinstance(sheet_to_dataframe, pd.DataFrame):
            sheet_to_dataframe = {self.sheet_name: sheet_to_dataframe}

        tables = []
        metadata = []
        for sheet, df in sheet_to_dataframe.items():
            # Row starts at 1 in Excel
            df.index = df.index + 1
            # Excel column names are Alphabet Characters
            df.columns = self._generate_excel_column_names(df.shape[1])

            if self.table_format == "csv":
                # User-supplied kwargs come last so they can override these defaults.
                resolved_kwargs = {"index": True, "header": True, "lineterminator": "\n", **self.table_format_kwargs}
                tables.append(df.to_csv(**resolved_kwargs))
            else:
                resolved_kwargs = {
                    "index": True,
                    "headers": df.columns,
                    "tablefmt": "pipe",
                    **self.table_format_kwargs,
                }
                # to_markdown uses tabulate
                tables.append(df.to_markdown(**resolved_kwargs))
            # add sheet_name to metadata
            metadata.append({"xlsx": {"sheet_name": sheet}})
        return tables, metadata
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ extra-dependencies = [
"trafilatura", # HTMLToDocument
"python-pptx", # PPTXToDocument
"python-docx", # DocxToDocument
"jq", #JSONConverter
"jq", # JSONConverter
"openpyxl", # XLSXToDocument
"tabulate", # XLSXToDocument

"nltk", # NLTKDocumentSplitter

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Add the `XLSXToDocument` converter, which loads an Excel file using pandas and openpyxl and, by default, converts each sheet into a separate Document in CSV format.
139 changes: 139 additions & 0 deletions test/components/converters/test_xlsx_to_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import logging
from typing import Union

import pytest

from haystack.components.converters.xlsx import XLSXToDocument


class TestXLSXToDocument:
    """
    Tests for the XLSXToDocument converter.

    NOTE(review): `test_files_path` is a pytest fixture that is not defined in this file; it is presumably
    provided by the project's conftest and points at the test files directory.
    """

    def test_init(self) -> None:
        """Default construction stores the documented defaults."""
        converter = XLSXToDocument()
        assert converter.sheet_name is None
        assert converter.read_excel_kwargs == {}
        assert converter.table_format == "csv"
        assert converter.table_format_kwargs == {}

    def test_run_basic_tables(self, test_files_path) -> None:
        """Every sheet of a two-sheet workbook becomes its own CSV Document."""
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 2
        assert documents[0].content == ",A,B\n1,col_a,col_b\n2,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }
        assert documents[1].content == ",A,B\n1,col_c,col_d\n2,True,\n"
        assert documents[1].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Table Missing Value"},
        }

    def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
        """Leading empty rows and columns are kept so cell positions match the sheet."""
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
        """Several tables on one sheet are emitted together as a single Document."""
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert (
            documents[0].content
            == ",A,B,C,D,E,F\n1,,,,,,\n2,,,,,,\n3,,col_a,col_b,,,\n4,,1.5,test,,col_c,col_d\n5,,,,,3,True\n"
        )
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "multiple_tables.xlsx"),
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_markdown(self, test_files_path) -> None:
        """Markdown export renders each sheet as a pipe table."""
        converter = XLSXToDocument(table_format="markdown")
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 2
        # Cells are padded to the column width, so every row must be as wide as the separator row.
        assert (
            documents[0].content
            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_a | col_b |\n|  2 | 1.5   | test  |"
        )
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }
        assert (
            documents[1].content
            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_c | col_d |\n|  2 | True  | nan   |"
        )
        assert documents[1].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Table Missing Value"},
        }

    @pytest.mark.parametrize(
        "sheet_name, expected_sheet_name, expected_content",
        [
            ("Basic Table", "Basic Table", ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
            ("Table Missing Value", "Table Missing Value", ",A,B\n1,col_c,col_d\n2,True,\n"),
            (0, 0, ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
            (1, 1, ",A,B\n1,col_c,col_d\n2,True,\n"),
        ],
    )
    def test_run_sheet_name(
        self,
        sheet_name: Union[int, str],
        expected_sheet_name: Union[int, str],
        expected_content: str,
        test_files_path,
    ) -> None:
        """A sheet can be selected by name or by position; the selector is echoed back in the metadata."""
        converter = XLSXToDocument(sheet_name=sheet_name)
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths)
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == expected_content
        assert documents[0].meta == {
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": expected_sheet_name},
        }

    def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
        """Extra `read_excel` kwargs are forwarded; row labels restart at 1 after the skipped row."""
        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == ",A,B\n1,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }

    def test_run_error_wrong_file_type(self, caplog: pytest.LogCaptureFixture, test_files_path) -> None:
        """A file that is not a workbook is skipped with a warning rather than raising."""
        converter = XLSXToDocument()
        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=sources)
            assert "sample_pdf_1.pdf and convert it" in caplog.text
        assert results["documents"] == []

    def test_run_error_non_existent_file(self, caplog: pytest.LogCaptureFixture) -> None:
        """A missing source is skipped with a warning and produces no documents."""
        converter = XLSXToDocument()
        paths = ["non_existing_file.xlsx"]
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=paths)
            assert "Could not read non_existing_file.xlsx" in caplog.text
        assert results["documents"] == []
Binary file added test/test_files/xlsx/basic_tables_two_sheets.xlsx
Binary file not shown.
Binary file added test/test_files/xlsx/multiple_tables.xlsx
Binary file not shown.
Binary file not shown.

0 comments on commit 25721bb

Please sign in to comment.