Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rfctr: prep for pluggable partitioners #3806

Merged
merged 16 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
## 0.16.11
## 0.16.12-dev0

### Enhancements

- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes.

### Features

### Fixes

- Fix IPv4 regex to correctly match octets of up to three digits.
## 0.16.11

### Enhancements

Expand All @@ -14,6 +20,8 @@

### Fixes

- Fix IPv4 regex to correctly match octets of up to three digits.

## 0.16.10

### Enhancements
Expand Down
1 change: 1 addition & 0 deletions test_unstructured/metrics/test_element_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
("Title", 0): 4,
("Title", 1): 1,
("NarrativeText", 0): 3,
("PageBreak", None): 3,
("ListItem", 0): 6,
("ListItem", 1): 6,
("ListItem", 2): 3,
Expand Down
11 changes: 0 additions & 11 deletions test_unstructured/partition/html/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -1232,17 +1232,6 @@ def it_knows_the_caller_provided_detection_origin(

assert opts.detection_origin == detection_origin

# -- .encoding -------------------------------

@pytest.mark.parametrize("encoding", ["utf-8", None])
def it_knows_the_caller_provided_encoding(
self, encoding: str | None, opts_args: dict[str, Any]
):
opts_args["encoding"] = encoding
opts = HtmlPartitionerOptions(**opts_args)

assert opts.encoding == encoding

# -- .html_text ------------------------------

def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]):
Expand Down
24 changes: 1 addition & 23 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from __future__ import annotations

import io
import json
import os
import pathlib
Expand Down Expand Up @@ -561,7 +560,6 @@ def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
strategy=PartitionStrategy.FAST,
languages=None,
metadata_filename=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=False,
extract_image_block_types=None,
Expand Down Expand Up @@ -897,7 +895,7 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest):

with pytest.raises(
UnsupportedFileFormatError,
match="Invalid file made-up.fake. The FileType.UNK file type is not supported in partiti",
match="Partitioning is not supported for the FileType.UNK file type.",
):
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

Expand Down Expand Up @@ -1037,26 +1035,6 @@ def test_auto_partition_forwards_metadata_filename_via_kwargs():
assert all(e.metadata.filename == "much-more-interesting-name.txt" for e in elements)


def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
file_path = example_doc_path("fake-text.txt")

with open(file_path, "rb") as f:
elements = partition(file=f, file_filename=file_path)

assert all(e.metadata.filename == "fake-text.txt" for e in elements)
assert caplog.records[0].levelname == "WARNING"
assert "The file_filename kwarg will be deprecated" in caplog.text


def test_auto_partition_raises_when_both_file_filename_and_metadata_filename_args_are_used():
file_path = example_doc_path("fake-text.txt")
with open(file_path, "rb") as f:
file = io.BytesIO(f.read())

with pytest.raises(ValueError, match="Only one of metadata_filename and file_filename is spe"):
partition(file=file, file_filename=file_path, metadata_filename=file_path)


# -- ocr_languages --------------------------------------------------------


Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.11" # pragma: no cover
__version__ = "0.16.12-dev0" # pragma: no cover
Loading
Loading