diff --git a/CHANGELOG.md b/CHANGELOG.md index 46d75bea05..9f929c0429 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,14 @@ -## 0.16.11 +## 0.16.12-dev0 + +### Enhancements + +- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes. + +### Features ### Fixes -- Fix ipv4 regex to correctly include up to three digit octets. +## 0.16.11 ### Enhancements @@ -14,6 +20,8 @@ ### Fixes +- Fix ipv4 regex to correctly include up to three digit octets. + ## 0.16.10 ### Enhancements diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py index ab30b007ea..d1faba6a20 100644 --- a/test_unstructured/metrics/test_element_type.py +++ b/test_unstructured/metrics/test_element_type.py @@ -29,6 +29,7 @@ ("Title", 0): 4, ("Title", 1): 1, ("NarrativeText", 0): 3, + ("PageBreak", None): 3, ("ListItem", 0): 6, ("ListItem", 1): 6, ("ListItem", 2): 3, diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py index 022122b54d..fd84a676c9 100644 --- a/test_unstructured/partition/html/test_partition.py +++ b/test_unstructured/partition/html/test_partition.py @@ -1232,17 +1232,6 @@ def it_knows_the_caller_provided_detection_origin( assert opts.detection_origin == detection_origin - # -- .encoding ------------------------------- - - @pytest.mark.parametrize("encoding", ["utf-8", None]) - def it_knows_the_caller_provided_encoding( - self, encoding: str | None, opts_args: dict[str, Any] - ): - opts_args["encoding"] = encoding - opts = HtmlPartitionerOptions(**opts_args) - - assert opts.encoding == encoding - # -- .html_text ------------------------------ def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]): diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 3639f26803..9443058176 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -2,7 +2,6 @@ from __future__ import annotations -import io import json import os import pathlib @@ -561,7 +560,6 @@ def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest): strategy=PartitionStrategy.FAST, languages=None, metadata_filename=None, - include_page_breaks=False, infer_table_structure=False, extract_images_in_pdf=False, extract_image_block_types=None, @@ -897,7 +895,7 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest): with pytest.raises( UnsupportedFileFormatError, - match="Invalid file made-up.fake. The FileType.UNK file type is not supported in partiti", + match="Partitioning is not supported for the FileType.UNK file type.", ): partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES) @@ -1037,26 +1035,6 @@ def test_auto_partition_forwards_metadata_filename_via_kwargs(): assert all(e.metadata.filename == "much-more-interesting-name.txt" for e in elements) -def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture): - file_path = example_doc_path("fake-text.txt") - - with open(file_path, "rb") as f: - elements = partition(file=f, file_filename=file_path) - - assert all(e.metadata.filename == "fake-text.txt" for e in elements) - assert caplog.records[0].levelname == "WARNING" - assert "The file_filename kwarg will be deprecated" in caplog.text - - -def test_auto_partition_raises_when_both_file_filename_and_metadata_filename_args_are_used(): - file_path = example_doc_path("fake-text.txt") - with open(file_path, "rb") as f: - file = io.BytesIO(f.read()) - - with pytest.raises(ValueError, match="Only one of metadata_filename and file_filename is spe"): - partition(file=file, file_filename=file_path, metadata_filename=file_path) - - # -- ocr_languages -------------------------------------------------------- diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2493d925be..5e9d1b8bb0 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.11" # pragma: no cover +__version__ = "0.16.12-dev0" # pragma: no cover diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 7f3bb5e5b3..02ae5219f1 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -2,9 +2,10 @@ from __future__ import annotations +import copy import importlib import io -from typing import IO, Any, Callable, Literal, Optional +from typing import IO, Any, Callable, Optional import requests from typing_extensions import TypeAlias @@ -25,17 +26,15 @@ def partition( filename: Optional[str] = None, *, - content_type: Optional[str] = None, file: Optional[IO[bytes]] = None, - file_filename: Optional[str] = None, - url: Optional[str] = None, - include_page_breaks: bool = False, - strategy: str = PartitionStrategy.AUTO, encoding: Optional[str] = None, - paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None, + content_type: Optional[str] = None, + url: Optional[str] = None, headers: dict[str, str] = {}, - skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"], ssl_verify: bool = True, + request_timeout: Optional[int] = None, + strategy: str = PartitionStrategy.AUTO, + skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"], ocr_languages: Optional[str] = None, # changing to optional for deprecation languages: Optional[list[str]] = None, detect_language_per_element: bool = False, @@ -44,15 +43,13 @@ def partition( extract_image_block_types: Optional[list[str]] = None, extract_image_block_output_dir: Optional[str] = None, extract_image_block_to_payload: bool = False, - xml_keep_tags: bool = False, data_source_metadata: Optional[DataSourceMetadata] = None, metadata_filename: Optional[str] = None, - request_timeout: Optional[int] = None, hi_res_model_name: Optional[str] = None, model_name: Optional[str] = None, # to be deprecated starting_page_number: int = 1, **kwargs: Any, -): +) -> list[Element]: """Partitions a document into its constituent elements. Uses libmagic to determine the file's type and route it to the appropriate partitioning @@ -63,30 +60,32 @@ def partition( ---------- filename A string defining the target filename path. - content_type - A string defining the file content in MIME type file A file-like object using "rb" mode --> open(filename, "rb"). - metadata_filename - When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" + encoding + The character-encoding used to decode the input bytes when drawn from `filename` or `file`. + Defaults to "utf-8". url The url for a remote document. Pass in content_type if you want partition to treat the document as a specific content_type. - include_page_breaks - If True, the output will include page breaks if the filetype supports it + headers + The headers to be used in conjunction with the HTTP request if URL is set. + ssl_verify + If the URL parameter is set, determines whether or not partition uses SSL verification + in the HTTP request. + request_timeout + The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and + requests will block indefinitely. + content_type + A string defining the file content in MIME type + metadata_filename + When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" strategy The strategy to use for partitioning PDF/image. Uses a layout detection model if set to 'hi_res', otherwise partition simply extracts the text from the document and processes it. - encoding - The encoding method used to decode the text input. If None, utf-8 will be used. - headers - The headers to be used in conjunction with the HTTP request if URL is set. skip_infer_table_types The document types that you want to skip table extraction with. - ssl_verify - If the URL parameter is set, determines whether or not partition uses SSL verification - in the HTTP request. languages The languages present in the document, for use in partitioning and/or OCR. For partitioning image or pdf documents with Tesseract, you'll first need to install the appropriate @@ -124,12 +123,6 @@ def partition( Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`. The filesystem path for saving images of the element type(s) specified in 'extract_image_block_types'. - xml_keep_tags - If True, will retain the XML tags in the output. Otherwise it will simply extract - the text from within the tags. Only applies to partition_xml. - request_timeout - The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and - requests will block indefinitely. hi_res_model_name The layout detection model used when partitioning strategy is set to `hi_res`. model_name @@ -142,18 +135,6 @@ def partition( """ exactly_one(file=file, filename=filename, url=url) - if metadata_filename and file_filename: - raise ValueError( - "Only one of metadata_filename and file_filename is specified. " - "metadata_filename is preferred. file_filename is marked for deprecation.", - ) - - if file_filename is not None: - metadata_filename = file_filename - logger.warning( - "The file_filename kwarg will be deprecated in a future version of unstructured. " - "Please use metadata_filename instead.", - ) kwargs.setdefault("metadata_filename", metadata_filename) if pdf_infer_table_structure: @@ -197,80 +178,47 @@ def partition( partitioner_loader = _PartitionerLoader() - if file_type == FileType.CSV: - partition_csv = partitioner_loader.get(file_type) - elements = partition_csv( - filename=filename, - file=file, - encoding=encoding, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.DOC: - partition_doc = partitioner_loader.get(file_type) - elements = partition_doc( + # -- extracting this post-processing to allow multiple exit-points from function -- + def augment_metadata(elements: list[Element]) -> list[Element]: + """Add some metadata fields to each element.""" + for element in elements: + element.metadata.url = url + element.metadata.data_source = data_source_metadata + if content_type is not None: + out_filetype = FileType.from_mime_type(content_type) + element.metadata.filetype = out_filetype.mime_type if out_filetype else None + else: + element.metadata.filetype = file_type.mime_type + + return elements + + # -- handle PDF/Image partitioning separately because they have a lot of special-case + # -- parameters. We'll come back to this after sorting out the other file types. + if file_type == FileType.PDF: + partition_pdf = partitioner_loader.get(file_type) + elements = partition_pdf( filename=filename, file=file, + url=None, infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - starting_page_number=starting_page_number, strategy=strategy, - **kwargs, - ) - elif file_type == FileType.DOCX: - partition_docx = partitioner_loader.get(file_type) - elements = partition_docx( - filename=filename, - file=file, - infer_table_structure=infer_table_structure, languages=languages, - detect_language_per_element=detect_language_per_element, + hi_res_model_name=hi_res_model_name or model_name, + extract_images_in_pdf=extract_images_in_pdf, + extract_image_block_types=extract_image_block_types, + extract_image_block_output_dir=extract_image_block_output_dir, + extract_image_block_to_payload=extract_image_block_to_payload, starting_page_number=starting_page_number, - strategy=strategy, **kwargs, ) - elif file_type == FileType.EML: - partition_email = partitioner_loader.get(file_type) - elements = partition_email( - filename=filename, - file=file, - encoding=encoding, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.EPUB: - partition_epub = partitioner_loader.get(file_type) - elements = partition_epub( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.HTML: - partition_html = partitioner_loader.get(file_type) - elements = partition_html( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - encoding=encoding, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type.partitioner_shortname == "image": + return augment_metadata(elements) + + if file_type.partitioner_shortname == "image": partition_image = partitioner_loader.get(file_type) elements = partition_image( filename=filename, file=file, url=None, - include_page_breaks=include_page_breaks, infer_table_structure=infer_table_structure, strategy=strategy, languages=languages, @@ -282,7 +230,11 @@ def partition( starting_page_number=starting_page_number, **kwargs, ) - elif file_type == FileType.JSON: + return augment_metadata(elements) + + # -- JSON is a special case because it's not a document format per se and is insensitive to + # -- most of the parameters that apply to other file types. + if file_type == FileType.JSON: if not is_json_processable(filename=filename, file=file): raise ValueError( "Detected a JSON file that does not conform to the Unstructured schema. " @@ -290,173 +242,28 @@ def partition( ) partition_json = partitioner_loader.get(file_type) elements = partition_json(filename=filename, file=file, **kwargs) - elif file_type == FileType.MD: - partition_md = partitioner_loader.get(file_type) - elements = partition_md( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.MSG: - partition_msg = partitioner_loader.get(file_type) - elements = partition_msg( - filename=filename, - file=file, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.ODT: - partition_odt = partitioner_loader.get(file_type) - elements = partition_odt( - filename=filename, - file=file, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - starting_page_number=starting_page_number, - strategy=strategy, - **kwargs, - ) - elif file_type == FileType.ORG: - partition_org = partitioner_loader.get(file_type) - elements = partition_org( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.PDF: - partition_pdf = partitioner_loader.get(file_type) - elements = partition_pdf( - filename=filename, - file=file, - url=None, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - strategy=strategy, - languages=languages, - hi_res_model_name=hi_res_model_name or model_name, - extract_images_in_pdf=extract_images_in_pdf, - extract_image_block_types=extract_image_block_types, - extract_image_block_output_dir=extract_image_block_output_dir, - extract_image_block_to_payload=extract_image_block_to_payload, - starting_page_number=starting_page_number, - **kwargs, - ) - elif file_type == FileType.PPT: - partition_ppt = partitioner_loader.get(file_type) - elements = partition_ppt( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - strategy=strategy, - **kwargs, - ) - elif file_type == FileType.PPTX: - partition_pptx = partitioner_loader.get(file_type) - elements = partition_pptx( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - starting_page_number=starting_page_number, - strategy=strategy, - **kwargs, - ) - elif file_type == FileType.RST: - partition_rst = partitioner_loader.get(file_type) - elements = partition_rst( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.RTF: - partition_rtf = partitioner_loader.get(file_type) - elements = partition_rtf( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.TSV: - partition_tsv = partitioner_loader.get(file_type) - elements = partition_tsv( - filename=filename, - file=file, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.TXT: - partition_text = partitioner_loader.get(file_type) - elements = partition_text( - filename=filename, - file=file, - encoding=encoding, - paragraph_grouper=paragraph_grouper, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type in (FileType.XLS, FileType.XLSX): - partition_xlsx = partitioner_loader.get(file_type) - elements = partition_xlsx( - filename=filename, - file=file, - infer_table_structure=infer_table_structure, - languages=languages, - detect_language_per_element=detect_language_per_element, - starting_page_number=starting_page_number, - **kwargs, - ) - elif file_type == FileType.XML: - partition_xml = partitioner_loader.get(file_type) - elements = partition_xml( - filename=filename, - file=file, - encoding=encoding, - xml_keep_tags=xml_keep_tags, - languages=languages, - detect_language_per_element=detect_language_per_element, - **kwargs, - ) - elif file_type == FileType.EMPTY: - elements = [] - else: - msg = "Invalid file" if not filename else f"Invalid file {filename}" - raise UnsupportedFileFormatError( - f"{msg}. The {file_type} file type is not supported in partition." - ) + return augment_metadata(elements) + + # -- EMPTY is also a special case because while we can't determine the file type, we can be + # -- sure it doesn't contain any elements. + if file_type == FileType.EMPTY: + return [] + + # ============================================================================================ + # ALL OTHER FILE TYPES + # ============================================================================================ - for element in elements: - element.metadata.url = url - element.metadata.data_source = data_source_metadata - if content_type is not None: - out_filetype = FileType.from_mime_type(content_type) - element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None - else: - element.metadata.filetype = file_type.mime_type + partitioning_kwargs = copy.deepcopy(kwargs) + partitioning_kwargs["detect_language_per_element"] = detect_language_per_element + partitioning_kwargs["encoding"] = encoding + partitioning_kwargs["infer_table_structure"] = infer_table_structure + partitioning_kwargs["languages"] = languages + partitioning_kwargs["starting_page_number"] = starting_page_number + partitioning_kwargs["strategy"] = strategy - return elements + partition = partitioner_loader.get(file_type) + elements = partition(filename=filename, file=file, **partitioning_kwargs) + return augment_metadata(elements) def file_and_type_from_url( @@ -499,17 +306,15 @@ def decide_table_extraction( class _PartitionerLoader: """Provides uniform helpful error when a partitioner dependency is not installed. - Used by `partition()` to encapsulate coping with the possibility the Python - environment it is executing in may not have all dependencies installed for a - particular partitioner. + Used by `partition()` to encapsulate coping with the possibility the Python environment it is + executing in may not have all dependencies installed for a particular partitioner. - Provides `.get()` to access partitioners by file-type, which raises when one or - more dependencies for that partitioner are not installed. + Provides `.get()` to access partitioners by file-type, which raises when one or more + dependencies for that partitioner are not installed. - The error message indicates what extra needs to be installed to enable that - partitioner. This avoids an inconsistent variety of possibly puzzling exceptions - arising from much deeper in the partitioner when access to the missing dependency is - first attempted. + The error message indicates what extra needs to be installed to enable that partitioner. This + avoids an inconsistent variety of possibly puzzling exceptions arising from much deeper in the + partitioner when access to the missing dependency is first attempted. """ # -- module-lifetime cache for partitioners once loaded -- @@ -519,8 +324,15 @@ def get(self, file_type: FileType) -> Partitioner: """Return partitioner for `file_type`. Raises when one or more package dependencies for that file-type have not been - installed. + installed. Also raises when the file-type is not partitionable. """ + if not file_type.is_partitionable: + raise UnsupportedFileFormatError( + f"Partitioning is not supported for the {file_type} file type." + ) + + # -- if the partitioner is not in the cache, load it; note this raises if one or more of + # -- the partitioner's dependencies is not installed. if file_type not in self._partitioners: self._partitioners[file_type] = self._load_partitioner(file_type) diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py index 7ed13d5ae2..a0d3af7cd3 100644 --- a/unstructured/partition/epub.py +++ b/unstructured/partition/epub.py @@ -51,7 +51,6 @@ def partition_epub( return partition_html( text=html_text, - encoding="unicode", metadata_filename=metadata_filename or filename, metadata_file_type=FileType.EPUB, metadata_last_modified=metadata_last_modified or last_modified, diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py index 865a727b23..57dcc2b5c8 100644 --- a/unstructured/partition/html/partition.py +++ b/unstructured/partition/html/partition.py @@ -126,14 +126,6 @@ def detection_origin(self) -> str | None: """Trace of initial partitioner to be included in metadata for debugging purposes.""" return self._detection_origin - @lazyproperty - def encoding(self) -> str | None: - """Caller-provided encoding used to store HTML character stream as bytes. - - `None` when no encoding was provided and encoding should be auto-detected. - """ - return self._encoding - @lazyproperty def html_text(self) -> str: """The HTML document as a string, loaded from wherever the caller specified.""" diff --git a/unstructured/partition/org.py b/unstructured/partition/org.py index 8a958aa09e..93e973a02e 100644 --- a/unstructured/partition/org.py +++ b/unstructured/partition/org.py @@ -42,7 +42,6 @@ def partition_org( return partition_html( text=html_text, - encoding="unicode", metadata_filename=metadata_filename or filename, metadata_file_type=FileType.ORG, metadata_last_modified=metadata_last_modified or last_modified, diff --git a/unstructured/partition/rst.py b/unstructured/partition/rst.py index 84e4bc39c5..55da078df2 100644 --- a/unstructured/partition/rst.py +++ b/unstructured/partition/rst.py @@ -42,7 +42,6 @@ def partition_rst( return partition_html( text=html_text, - encoding="unicode", metadata_filename=metadata_filename or filename, metadata_file_type=FileType.RST, metadata_last_modified=metadata_last_modified or last_modified, diff --git a/unstructured/partition/rtf.py b/unstructured/partition/rtf.py index 9006d9c8fa..aa07853494 100644 --- a/unstructured/partition/rtf.py +++ b/unstructured/partition/rtf.py @@ -42,7 +42,6 @@ def partition_rtf( return partition_html( text=html_text, - encoding="unicode", metadata_filename=metadata_filename or filename, metadata_file_type=FileType.RTF, metadata_last_modified=metadata_last_modified or last_modified,