Merge branch 'main' into add-recursive-chunking

deepset-ai · Jan 9, 2025 · 25721bb · 25721bb
2 parents df214d6 + 28ad78c
commit 25721bb
Show file tree

Hide file tree

Showing 9 changed files with 329 additions and 1 deletion.
diff --git a/docs/pydoc/config/converters_api.yml b/docs/pydoc/config/converters_api.yml
@@ -16,6 +16,7 @@ loaders:
         "pypdf",
         "tika",
         "txt",
+        "xlsx",
       ]
     ignore_when_discovered: ["__init__"]
 processors:

diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py
@@ -15,6 +15,7 @@
 from haystack.components.converters.pypdf import PyPDFToDocument
 from haystack.components.converters.tika import TikaDocumentConverter
 from haystack.components.converters.txt import TextFileToDocument
+from haystack.components.converters.xlsx import XLSXToDocument
 
 __all__ = [
     "TextFileToDocument",
@@ -31,4 +32,5 @@
     "PPTXToDocument",
     "CSVToDocument",
     "JSONConverter",
+    "XLSXToDocument",
 ]
diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py
@@ -0,0 +1,180 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import io
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+import pandas as pd
+
+from haystack import Document, component, logging
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
+from haystack.dataclasses import ByteStream
+from haystack.lazy_imports import LazyImport
+
+logger = logging.getLogger(__name__)
+
+with LazyImport("Run 'pip install openpyxl'") as xlsx_import:
+    import openpyxl  # pylint: disable=unused-import # the library is used but not directly referenced
+
+with LazyImport("Run 'pip install tabulate'") as tabulate_import:
+    from tabulate import tabulate  # pylint: disable=unused-import # the library is used but not directly referenced
+
+
+@component
+class XLSXToDocument:
+    """
+    Converts XLSX (Excel) files into Documents.
+
+    Supports reading data from specific sheets or all sheets in the Excel file. If all sheets are read, a Document is
+    created for each sheet. The content of each Document is the sheet's table, rendered in CSV or Markdown format.
+
+    ### Usage example
+
+    ```python
+    from datetime import datetime
+
+    from haystack.components.converters.xlsx import XLSXToDocument
+
+    converter = XLSXToDocument()
+    results = converter.run(sources=["sample.xlsx"], meta={"date_added": datetime.now().isoformat()})
+    documents = results["documents"]
+    print(documents[0].content)
+    # ",A,B\n1,col_a,col_b\n2,1.5,test\n"
+    ```
+    """
+
+    def __init__(
+        self,
+        table_format: Literal["csv", "markdown"] = "csv",
+        sheet_name: Union[str, int, List[Union[str, int]], None] = None,
+        read_excel_kwargs: Optional[Dict[str, Any]] = None,
+        table_format_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Creates an XLSXToDocument component.
+
+        :param table_format: The format to convert the Excel file to. Either "csv" or "markdown".
+        :param sheet_name: Sheet name, zero-based index, or list of these to read. If None, all sheets are read.
+        :param read_excel_kwargs: Additional arguments to pass to `pandas.read_excel`.
+            See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html#pandas-read-excel
+        :param table_format_kwargs: Additional keyword arguments to pass to the table format function.
+            - If `table_format` is "csv", these arguments are passed to `pandas.DataFrame.to_csv`.
+              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
+            - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
+              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
+        """
+        xlsx_import.check()
+        self.table_format = table_format
+        if table_format not in ["csv", "markdown"]:
+            raise ValueError(f"Unsupported export format: {table_format}. Choose either 'csv' or 'markdown'.")
+        if table_format == "markdown":
+            tabulate_import.check()
+        self.sheet_name = sheet_name
+        self.read_excel_kwargs = read_excel_kwargs or {}
+        self.table_format_kwargs = table_format_kwargs or {}
+
+    @component.output_types(documents=List[Document])
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ) -> Dict[str, List[Document]]:
+        """
+        Converts XLSX files to Documents, creating one Document per sheet that is read.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will
+            be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created documents
+        """
+        documents = []
+
+        meta_list = normalize_metadata(meta, sources_count=len(sources))
+
+        for source, metadata in zip(sources, meta_list):
+            try:
+                bytestream = get_bytestream_from_source(source)
+            except Exception as e:
+                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
+                continue
+
+            try:
+                tables, tables_metadata = self._extract_tables(bytestream)
+            except Exception as e:
+                logger.warning(
+                    "Could not read {source} and convert it to a Document, skipping. Error: {error}",
+                    source=source,
+                    error=e,
+                )
+                continue
+
+            # Loop over tables and create a Document for each table
+            for table, excel_metadata in zip(tables, tables_metadata):
+                merged_metadata = {**bytestream.meta, **metadata, **excel_metadata}
+                document = Document(content=table, meta=merged_metadata)
+                documents.append(document)
+
+        return {"documents": documents}
+
+    @staticmethod
+    def _generate_excel_column_names(n_cols: int) -> List[str]:
+        result = []
+        for i in range(n_cols):
+            col_name = ""
+            num = i
+            while num >= 0:
+                col_name = chr(num % 26 + 65) + col_name
+                num = num // 26 - 1
+            result.append(col_name)
+        return result
+
+    def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict]]:
+        """
+        Extract tables from an Excel file.
+        """
+        resolved_read_excel_kwargs = {
+            **self.read_excel_kwargs,
+            "sheet_name": self.sheet_name,
+            "header": None,  # Don't assign any pandas column labels
+            "engine": "openpyxl",  # Use openpyxl as the engine to read the Excel file
+        }
+        sheet_to_dataframe = pd.read_excel(io=io.BytesIO(bytestream.data), **resolved_read_excel_kwargs)
+        if isinstance(sheet_to_dataframe, pd.DataFrame):
+            sheet_to_dataframe = {self.sheet_name: sheet_to_dataframe}
+
+        updated_sheet_to_dataframe = {}
+        for key in sheet_to_dataframe:
+            df = sheet_to_dataframe[key]
+            # Rows are numbered starting at 1 in Excel
+            df.index = df.index + 1
+            # Excel column names are alphabetic (A, B, ..., Z, AA, AB, ...)
+            header = self._generate_excel_column_names(df.shape[1])
+            df.columns = header
+            updated_sheet_to_dataframe[key] = df
+
+        tables = []
+        metadata = []
+        for key, value in updated_sheet_to_dataframe.items():
+            if self.table_format == "csv":
+                resolved_kwargs = {"index": True, "header": True, "lineterminator": "\n", **self.table_format_kwargs}
+                tables.append(value.to_csv(**resolved_kwargs))
+            else:
+                resolved_kwargs = {
+                    "index": True,
+                    "headers": value.columns,
+                    "tablefmt": "pipe",
+                    **self.table_format_kwargs,
+                }
+                # to_markdown uses tabulate
+                tables.append(value.to_markdown(**resolved_kwargs))
+            # add sheet_name to metadata
+            metadata.append({"xlsx": {"sheet_name": key}})
+        return tables, metadata
diff --git a/pyproject.toml b/pyproject.toml
@@ -106,7 +106,9 @@ extra-dependencies = [
   "trafilatura",                      # HTMLToDocument
   "python-pptx",                      # PPTXToDocument
   "python-docx",                      # DocxToDocument
-  "jq",                               #JSONConverter
+  "jq",                               # JSONConverter
+  "openpyxl",                         # XLSXToDocument
+  "tabulate",                         # XLSXToDocument
 
   "nltk", # NLTKDocumentSplitter
 

diff --git a/releasenotes/notes/add-excel-to-document-converter-1920c9f9902ddf17.yaml b/releasenotes/notes/add-excel-to-document-converter-1920c9f9902ddf17.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add XLSXToDocument converter that loads an Excel file using Pandas + openpyxl and by default converts each sheet into a separate Document in CSV format.
diff --git a/test/components/converters/test_xlsx_to_document.py b/test/components/converters/test_xlsx_to_document.py
@@ -0,0 +1,139 @@
+import logging
+from typing import Union
+
+import pytest
+
+from haystack.components.converters.xlsx import XLSXToDocument
+
+
+class TestXLSXToDocument:
+    def test_init(self) -> None:
+        converter = XLSXToDocument()
+        assert converter.sheet_name is None
+        assert converter.read_excel_kwargs == {}
+        assert converter.table_format == "csv"
+        assert converter.table_format_kwargs == {}
+
+    def test_run_basic_tables(self, test_files_path) -> None:
+        converter = XLSXToDocument()
+        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
+        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
+        documents = results["documents"]
+        assert len(documents) == 2
+        assert documents[0].content == ",A,B\n1,col_a,col_b\n2,1.5,test\n"
+        assert documents[0].meta == {
+            "date_added": "2022-01-01T00:00:00",
+            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
+            "xlsx": {"sheet_name": "Basic Table"},
+        }
+        assert documents[1].content == ",A,B\n1,col_c,col_d\n2,True,\n"
+        assert documents[1].meta == {
+            "date_added": "2022-01-01T00:00:00",
+            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
+            "xlsx": {"sheet_name": "Table Missing Value"},
+        }
+
+    def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
+        converter = XLSXToDocument()
+        paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
+        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
+        documents = results["documents"]
+        assert len(documents) == 1
+        assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
+        assert documents[0].meta == {
+            "date_added": "2022-01-01T00:00:00",
+            "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
+            "xlsx": {"sheet_name": "Sheet1"},
+        }
+
+    def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
+        converter = XLSXToDocument()
+        paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
+        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
+        documents = results["documents"]
+        assert len(documents) == 1
+        assert (
+            documents[0].content
+            == ",A,B,C,D,E,F\n1,,,,,,\n2,,,,,,\n3,,col_a,col_b,,,\n4,,1.5,test,,col_c,col_d\n5,,,,,3,True\n"
+        )
+        assert documents[0].meta == {
+            "date_added": "2022-01-01T00:00:00",
+            "file_path": str(test_files_path / "xlsx" / "multiple_tables.xlsx"),
+            "xlsx": {"sheet_name": "Sheet1"},
+        }
+
+    def test_run_markdown(self, test_files_path) -> None:
+        converter = XLSXToDocument(table_format="markdown")
+        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
+        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
+        documents = results["documents"]
+        assert len(documents) == 2
+        assert (
+            documents[0].content
+            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_a | col_b |\n|  2 | 1.5   | test  |"
+        )
+        assert documents[0].meta == {
+            "date_added": "2022-01-01T00:00:00",
+            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
+            "xlsx": {"sheet_name": "Basic Table"},
+        }
+        assert (
+            documents[1].content
+            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_c | col_d |\n|  2 | True  | nan   |"
+        )
+        assert documents[1].meta == {
+            "date_added": "2022-01-01T00:00:00",
+            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
+            "xlsx": {"sheet_name": "Table Missing Value"},
+        }
+
+    @pytest.mark.parametrize(
+        "sheet_name, expected_sheet_name, expected_content",
+        [
+            ("Basic Table", "Basic Table", ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
+            ("Table Missing Value", "Table Missing Value", ",A,B\n1,col_c,col_d\n2,True,\n"),
+            (0, 0, ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
+            (1, 1, ",A,B\n1,col_c,col_d\n2,True,\n"),
+        ],
+    )
+    def test_run_sheet_name(
+        self, sheet_name: Union[int, str], expected_sheet_name: str, expected_content: str, test_files_path
+    ) -> None:
+        converter = XLSXToDocument(sheet_name=sheet_name)
+        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
+        results = converter.run(sources=paths)
+        documents = results["documents"]
+        assert len(documents) == 1
+        assert documents[0].content == expected_content
+        assert documents[0].meta == {
+            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
+            "xlsx": {"sheet_name": expected_sheet_name},
+        }
+
+    def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
+        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
+        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
+        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
+        documents = results["documents"]
+        assert len(documents) == 1
+        assert documents[0].content == ",A,B\n1,1.5,test\n"
+        assert documents[0].meta == {
+            "date_added": "2022-01-01T00:00:00",
+            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
+            "xlsx": {"sheet_name": "Basic Table"},
+        }
+
+    def test_run_error_wrong_file_type(self, caplog: pytest.LogCaptureFixture, test_files_path) -> None:
+        converter = XLSXToDocument()
+        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
+        with caplog.at_level(logging.WARNING):
+            results = converter.run(sources=sources)
+            assert "sample_pdf_1.pdf and convert it" in caplog.text
+            assert results["documents"] == []
+
+    def test_run_error_non_existent_file(self, caplog: pytest.LogCaptureFixture) -> None:
+        converter = XLSXToDocument()
+        paths = ["non_existing_file.docx"]
+        with caplog.at_level(logging.WARNING):
+            converter.run(sources=paths)
+            assert "Could not read non_existing_file.docx" in caplog.text
diff --git a/test/test_files/xlsx/basic_tables_two_sheets.xlsx b/test/test_files/xlsx/basic_tables_two_sheets.xlsx
diff --git a/test/test_files/xlsx/multiple_tables.xlsx b/test/test_files/xlsx/multiple_tables.xlsx
diff --git a/test/test_files/xlsx/table_empty_rows_and_columns.xlsx b/test/test_files/xlsx/table_empty_rows_and_columns.xlsx
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,6 +16,7 @@ loaders: @@
             "pypdf",
             "tika",
             "txt",
+            "xlsx",
           ]
         ignore_when_discovered: ["__init__"]
     processors:
@@ Expand Down @@