diff --git a/CHANGELOG.md b/CHANGELOG.md index 84784c22f8..9256b7e3bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ -## 0.15.14-dev3 +## 0.15.14-dev4 ### Enhancements ### Features +* **Add (but do not install) a new post-partitioning decorator to handle metadata added for all file-types, like `.filename`, `.filetype` and `.languages`.** This will be installed in a closely following PR to replace the four currently being used for this purpose. + ### Fixes * **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK. diff --git a/test_unstructured/partition/test_lang.py b/test_unstructured/partition/common/test_lang.py similarity index 85% rename from test_unstructured/partition/test_lang.py rename to test_unstructured/partition/common/test_lang.py index f1d743a8b8..076b405a71 100644 --- a/test_unstructured/partition/test_lang.py +++ b/test_unstructured/partition/common/test_lang.py @@ -6,15 +6,15 @@ import os import pathlib -from typing import Union import pytest +from test_unstructured.unit_utils import LogCaptureFixture from unstructured.documents.elements import ( NarrativeText, PageBreak, ) -from unstructured.partition.lang import ( +from unstructured.partition.common.lang import ( _clean_ocr_languages_arg, _convert_language_code_to_pytesseract_lang_code, apply_lang_metadata, @@ -61,13 +61,13 @@ def test_prepare_languages_for_tesseract_with_multiple_languages(): assert prepare_languages_for_tesseract(languages) == "jpn+jpn_vert+afr+eng+equ" -def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog): +def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog: LogCaptureFixture): languages = ["zzz", "chi"] assert prepare_languages_for_tesseract(languages) == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert" assert "not a valid standard language code" in caplog.text -def 
test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog): +def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog: LogCaptureFixture): languages = ["kbd", "eng"] assert prepare_languages_for_tesseract(languages) == "eng" assert "not a language supported by Tesseract" in caplog.text @@ -79,7 +79,7 @@ def test_prepare_languages_for_tesseract_None_languages(): prepare_languages_for_tesseract(languages) -def test_prepare_languages_for_tesseract_no_valid_languages(caplog): +def test_prepare_languages_for_tesseract_no_valid_languages(caplog: LogCaptureFixture): languages = [""] assert prepare_languages_for_tesseract(languages) == "eng" assert "Failed to find any valid standard language code from languages" in caplog.text @@ -96,11 +96,11 @@ def test_prepare_languages_for_tesseract_no_valid_languages(caplog): ("kor", "korean"), ], ) -def test_tesseract_to_paddle_language_valid_codes(tesseract_lang, expected_lang): +def test_tesseract_to_paddle_language_valid_codes(tesseract_lang: str, expected_lang: str): assert expected_lang == tesseract_to_paddle_language(tesseract_lang) -def test_tesseract_to_paddle_language_invalid_codes(caplog): +def test_tesseract_to_paddle_language_invalid_codes(caplog: LogCaptureFixture): tesseract_lang = "unsupported_lang" assert tesseract_to_paddle_language(tesseract_lang) == "en" assert "unsupported_lang is not a language code supported by PaddleOCR," in caplog.text @@ -114,7 +114,7 @@ def test_tesseract_to_paddle_language_invalid_codes(caplog): ("DEU", "german"), ], ) -def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang, expected_lang): +def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang: str, expected_lang: str): assert expected_lang == tesseract_to_paddle_language(tesseract_lang) @@ -139,7 +139,7 @@ def test_detect_languages_gets_multiple_languages(): assert detect_languages(text) == ["ces", "pol", "slk"] -def 
test_detect_languages_warns_for_auto_and_other_input(caplog): +def test_detect_languages_warns_for_auto_and_other_input(caplog: LogCaptureFixture): text = "This is another short sentence." languages = ["en", "auto", "rus"] assert detect_languages(text, languages) == ["eng"] @@ -149,10 +149,10 @@ def test_detect_languages_warns_for_auto_and_other_input(caplog): def test_detect_languages_raises_TypeError_for_invalid_languages(): with pytest.raises(TypeError): text = "This is a short sentence." - detect_languages(text, languages="eng") == ["eng"] + detect_languages(text, languages="eng") == ["eng"] # type: ignore -def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog): +def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog: LogCaptureFixture): elements = [NarrativeText("Sample text."), PageBreak("")] elements = list( apply_lang_metadata( @@ -171,7 +171,7 @@ def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog): ("fr", "fra"), ], ) -def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang): +def test_convert_language_code_to_pytesseract_lang_code(lang_in: str, expected_lang: str): assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in) @@ -187,7 +187,7 @@ def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang): ("deu+spa", "deu+spa"), # correct input ], ) -def test_clean_ocr_languages_arg(input_ocr_langs, expected): +def test_clean_ocr_languages_arg(input_ocr_langs: str, expected: str): assert _clean_ocr_languages_arg(input_ocr_langs) == expected @@ -209,12 +209,15 @@ def test_detect_languages_handles_spelled_out_languages(): ], ) def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_defined( - languages: Union[list[str], str], - ocr_languages: Union[list[str], str, None], + languages: list[str], + ocr_languages: list[str] | str, expected_langs: list[str], - caplog, + caplog: LogCaptureFixture, ): - returned_langs = 
check_language_args(languages=languages, ocr_languages=ocr_languages) + returned_langs = check_language_args( + languages=languages, + ocr_languages=ocr_languages, + ) for lang in returned_langs: # type: ignore assert lang in expected_langs assert "ocr_languages" in caplog.text @@ -231,10 +234,10 @@ def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are ], ) def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None( - languages: Union[list[str], str], - ocr_languages: Union[list[str], str, None], + languages: list[str], + ocr_languages: str, expected_langs: list[str], - caplog, + caplog: LogCaptureFixture, ): returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) for lang in returned_langs: # type: ignore @@ -250,19 +253,15 @@ def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None( ], ) def test_check_language_args_returns_None( - languages: Union[list[str], str, None], - ocr_languages: Union[list[str], str, None], + languages: list[str], + ocr_languages: None, ): returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) assert returned_langs is None -def test_check_language_args_returns_auto( - languages=["eng", "spa", "auto"], - ocr_languages=None, -): - returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) - assert returned_langs == ["auto"] +def test_check_language_args_returns_auto(): + assert check_language_args(languages=["eng", "spa", "auto"], ocr_languages=None) == ["auto"] @pytest.mark.parametrize( @@ -273,8 +272,11 @@ def test_check_language_args_returns_auto( ], ) def test_check_language_args_raises_error_when_ocr_languages_contains_auto( - languages: Union[list[str], str, None], - ocr_languages: Union[list[str], str, None], + languages: list[str], + ocr_languages: str | list[str], ): with pytest.raises(ValueError): - check_language_args(languages=languages, ocr_languages=ocr_languages) + 
check_language_args( + languages=languages, + ocr_languages=ocr_languages, + ) diff --git a/test_unstructured/partition/common/test_metadata.py b/test_unstructured/partition/common/test_metadata.py index 844486de89..345e55ae62 100644 --- a/test_unstructured/partition/common/test_metadata.py +++ b/test_unstructured/partition/common/test_metadata.py @@ -5,9 +5,13 @@ import datetime as dt import os import pathlib +from typing import Any, Callable + +import pytest from unstructured.documents.elements import ( CheckBox, + Element, ElementMetadata, FigureCaption, Header, @@ -16,7 +20,9 @@ Text, Title, ) +from unstructured.file_utils.model import FileType from unstructured.partition.common.metadata import ( + apply_metadata, get_last_modified_date, set_element_hierarchy, ) @@ -119,3 +125,193 @@ def test_set_element_hierarchy_custom_rule_set(): assert ( elements[5].metadata.parent_id == elements[4].id ), "FigureCaption should be child of Title 2" + + +class Describe_apply_metadata: + """Unit-test suite for `unstructured.partition.common.metadata.apply_metadata()` decorator.""" + + # -- unique-ids ------------------------------------------------------- + + def it_assigns_hash_element_ids_when_unique_ids_arg_is_not_specified( + self, fake_partitioner: Callable[..., list[Element]] + ): + partition = apply_metadata()(fake_partitioner) + + elements = partition() + elements_2 = partition() + + # -- SHA1 hash is 32 characters long, no hyphens -- + assert all(len(e.id) == 32 for e in elements) + assert all("-" not in e.id for e in elements) + # -- SHA1 hashes are deterministic -- + assert all(e.id == e2.id for e, e2 in zip(elements, elements_2)) + + def it_assigns_hash_element_ids_when_unique_ids_arg_is_False( + self, fake_partitioner: Callable[..., list[Element]] + ): + partition = apply_metadata()(fake_partitioner) + + elements = partition(unique_element_ids=False) + elements_2 = partition(unique_element_ids=False) + + # -- SHA1 hash is 32 characters long, no hyphens -- + assert 
all(len(e.id) == 32 for e in elements) + assert all("-" not in e.id for e in elements) + # -- SHA1 hashes are deterministic -- + assert all(e.id == e2.id for e, e2 in zip(elements, elements_2)) + + def it_leaves_UUID_element_ids_when_unique_ids_arg_is_True( + self, fake_partitioner: Callable[..., list[Element]] + ): + partition = apply_metadata()(fake_partitioner) + + elements = partition(unique_element_ids=True) + elements_2 = partition(unique_element_ids=True) + + # -- UUID is 36 characters long with four hyphens -- + assert all(len(e.id) == 36 for e in elements) + assert all(e.id.count("-") == 4 for e in elements) + # -- UUIDs are non-deterministic, different every time -- + assert all(e.id != e2.id for e, e2 in zip(elements, elements_2)) + + # -- parent-id -------------------------------------------------------- + + def it_computes_and_assigns_parent_id(self, fake_partitioner: Callable[..., list[Element]]): + partition = apply_metadata()(fake_partitioner) + + elements = partition() + + title = elements[0] + assert title.metadata.category_depth == 1 + narr_text = elements[1] + assert narr_text.metadata.parent_id == title.id + + # -- languages -------------------------------------------------------- + + def it_applies_language_metadata(self, fake_partitioner: Callable[..., list[Element]]): + partition = apply_metadata()(fake_partitioner) + + elements = partition(languages=["auto"], detect_language_per_element=True) + + assert all(e.metadata.languages == ["eng"] for e in elements) + + # -- filetype (MIME-type) --------------------------------------------- + + def it_assigns_the_value_of_a_metadata_file_type_arg_when_there_is_one( + self, fake_partitioner: Callable[..., list[Element]] + ): + """A `metadata_file_type` arg overrides the file-type specified in the decorator. + + This is used for example by a delegating partitioner to preserve the original file-type in + the metadata, like EPUB instead of the HTML that partitioner converts the .epub file to. 
+ """ + partition = apply_metadata(file_type=FileType.DOCX)(fake_partitioner) + + elements = partition(metadata_file_type=FileType.ODT) + + assert all( + e.metadata.filetype == "application/vnd.oasis.opendocument.text" for e in elements + ) + + def and_it_assigns_the_decorator_file_type_when_the_metadata_file_type_arg_is_omitted( + self, fake_partitioner: Callable[..., list[Element]] + ): + """The `file_type=...` decorator arg is the "normal" way to specify the file-type. + + This is used for principal (non-delegating) partitioners. + """ + partition = apply_metadata(file_type=FileType.DOCX)(fake_partitioner) + + elements = partition() + + DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert all(e.metadata.filetype == DOCX_MIME_TYPE for e in elements) + + def and_it_does_not_assign_file_type_metadata_when_both_are_omitted( + self, fake_partitioner: Callable[..., list[Element]] + ): + """A partitioner can elect to assign `.metadata.filetype` for itself. + + This is done in `partition_image()` for example where the same partitioner is used for + multiple file-types. 
+ """ + partition = apply_metadata()(fake_partitioner) + + elements = partition() + + assert all(e.metadata.filetype == "image/jpeg" for e in elements) + + # -- filename --------------------------------------------------------- + + def it_uses_metadata_filename_arg_value_when_present( + self, fake_partitioner: Callable[..., list[Element]] + ): + """A `metadata_filename` arg overrides all other sources.""" + partition = apply_metadata()(fake_partitioner) + + elements = partition(metadata_filename="a/b/c.xyz") + + assert all(e.metadata.filename == "c.xyz" for e in elements) + assert all(e.metadata.file_directory == "a/b" for e in elements) + + def and_it_uses_filename_arg_value_when_metadata_filename_arg_not_present( + self, fake_partitioner: Callable[..., list[Element]] + ): + partition = apply_metadata()(fake_partitioner) + + elements = partition(filename="a/b/c.xyz") + + assert all(e.metadata.filename == "c.xyz" for e in elements) + assert all(e.metadata.file_directory == "a/b" for e in elements) + + def and_it_does_not_assign_filename_metadata_when_neither_are_present( + self, fake_partitioner: Callable[..., list[Element]] + ): + partition = apply_metadata()(fake_partitioner) + + elements = partition() + + assert all(e.metadata.filename == "image.jpeg" for e in elements) + assert all(e.metadata.file_directory == "x/y/images" for e in elements) + + # -- url -------------------------------------------------------------- + + def it_assigns_url_metadata_field_when_url_arg_is_present( + self, fake_partitioner: Callable[..., list[Element]] + ): + partition = apply_metadata()(fake_partitioner) + + elements = partition(url="https://adobe.com/stock/54321") + + assert all(e.metadata.url == "https://adobe.com/stock/54321" for e in elements) + + def and_it_does_not_assign_url_metadata_when_url_arg_is_not_present( + self, fake_partitioner: Callable[..., list[Element]] + ): + partition = apply_metadata()(fake_partitioner) + + elements = partition() + + assert 
all(e.metadata.url == "http://images.com" for e in elements) + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def fake_partitioner(self) -> Callable[..., list[Element]]: + def fake_partitioner(**kwargs: Any) -> list[Element]: + title = Title("Introduction") + title.metadata.category_depth = 1 + title.metadata.file_directory = "x/y/images" + title.metadata.filename = "image.jpeg" + title.metadata.filetype = "image/jpeg" + title.metadata.url = "http://images.com" + + narr_text = NarrativeText("To understand bar you must first understand foo.") + narr_text.metadata.file_directory = "x/y/images" + narr_text.metadata.filename = "image.jpeg" + narr_text.metadata.filetype = "image/jpeg" + narr_text.metadata.url = "http://images.com" + + return [title, narr_text] + + return fake_partitioner diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 97668e6dc0..18a138fccd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.14-dev3" # pragma: no cover +__version__ = "0.15.14-dev4" # pragma: no cover diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index a9f033ce12..b5e95f2e85 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -14,7 +14,7 @@ from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.partition.common.common import exactly_one -from unstructured.partition.lang import check_language_args +from unstructured.partition.common.lang import check_language_args from unstructured.partition.utils.constants import PartitionStrategy from unstructured.utils import dependency_exists diff --git a/unstructured/partition/lang.py b/unstructured/partition/common/lang.py similarity index 93% rename from unstructured/partition/lang.py rename to unstructured/partition/common/lang.py index d91d648829..d73d5cb921 100644 --- 
a/unstructured/partition/lang.py +++ b/unstructured/partition/common/lang.py @@ -3,8 +3,12 @@ import re from typing import Iterable, Iterator, Optional -import iso639 -from langdetect import DetectorFactory, detect_langs, lang_detect_exception +import iso639 # pyright: ignore[reportMissingTypeStubs] +from langdetect import ( # pyright: ignore[reportMissingTypeStubs] + DetectorFactory, + detect_langs, # pyright: ignore[reportUnknownVariableType] + lang_detect_exception, +) from unstructured.documents.elements import Element from unstructured.logger import logger @@ -208,12 +212,13 @@ def prepare_languages_for_tesseract(languages: Optional[list[str]] = ["eng"]) -> """ if languages is None: raise ValueError("`languages` can not be `None`") - converted_languages = list( - filter( - lambda x: x is not None and x != "", - [_convert_language_code_to_pytesseract_lang_code(lang) for lang in languages], - ), - ) + converted_languages = [ + lang_code + for lang_code in ( + _convert_language_code_to_pytesseract_lang_code(lang) for lang in languages + ) + if lang_code + ] # Remove duplicates from the list but keep the original order converted_languages = list(dict.fromkeys(converted_languages)) if len(converted_languages) == 0: @@ -245,13 +250,17 @@ def tesseract_to_paddle_language(tesseract_language: str) -> str: return lang -def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> Optional[list[str]]: - """Handle users defining both `ocr_languages` and `languages`, giving preference to `languages` - and converting `ocr_languages` if needed, but defaulting to `None. +def check_language_args( + languages: list[str], ocr_languages: str | list[str] | None +) -> list[str] | None: + """Handle users defining both `ocr_languages` and `languages`. + + Give preference to `languages` and convert `ocr_languages` if needed, but default to `None`. `ocr_languages` is only a parameter for `auto.partition`, `partition_image`, & `partition_pdf`. 
`ocr_languages` should not be defined as 'auto' since 'auto' is intended for language detection - which is not supported by `partition_image` or `partition_pdf`.""" + which is not supported by `partition_image` or `partition_pdf`. + """ # --- Clean and update defaults if ocr_languages: ocr_languages = _clean_ocr_languages_arg(ocr_languages) @@ -259,6 +268,7 @@ def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> O "The ocr_languages kwarg will be deprecated in a future version of unstructured. " "Please use languages instead.", ) + assert ocr_languages is None or isinstance(ocr_languages, str) if ocr_languages and "auto" in ocr_languages: raise ValueError( @@ -268,7 +278,7 @@ def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> O " Language detection is not currently supported in pdfs or images." ) - if not isinstance(languages, list): + if not isinstance(languages, list): # pyright: ignore[reportUnnecessaryIsInstance] raise TypeError( "The language parameter must be a list of language codes as strings, ex. 
['eng']", ) @@ -354,7 +364,7 @@ def _convert_language_code_to_pytesseract_lang_code(lang: str) -> str: def _get_iso639_language_object(lang: str) -> Optional[iso639.Language]: try: - return iso639.Language.match(lang.lower()) + return iso639.Language.match(lang.lower()) # pyright: ignore[reportUnknownMemberType] except iso639.LanguageNotFoundError: logger.warning(f"{lang} is not a valid standard language code.") return None @@ -431,10 +441,10 @@ def detect_languages( # machine translation # TODO(shreya): decide how to maintain nonstandard chinese script information for langobj in langdetect_result: - if str(langobj.lang).startswith("zh"): + if str(langobj.lang).startswith("zh"): # pyright: ignore langdetect_langs.append("zho") else: - language = _get_iso639_language_object(langobj.lang[:3]) + language = _get_iso639_language_object(langobj.lang[:3]) # pyright: ignore if language: langdetect_langs.append(language.part3) diff --git a/unstructured/partition/common/metadata.py b/unstructured/partition/common/metadata.py index f783096a3d..7de1909a37 100644 --- a/unstructured/partition/common/metadata.py +++ b/unstructured/partition/common/metadata.py @@ -3,13 +3,21 @@ from __future__ import annotations import datetime as dt +import functools import os -from typing import Optional, Sequence +from typing import Any, Callable, Sequence -from unstructured.documents.elements import Element +from typing_extensions import ParamSpec +from unstructured.documents.elements import Element, ElementMetadata, assign_and_map_hash_ids +from unstructured.file_utils.model import FileType +from unstructured.partition.common.lang import apply_lang_metadata +from unstructured.utils import get_call_args_applying_defaults -def get_last_modified_date(filename: str) -> Optional[str]: +_P = ParamSpec("_P") + + +def get_last_modified_date(filename: str) -> str | None: """Modification time of file at path `filename`, if it exists. 
Returns `None` when `filename` is not a path to a file on the local filesystem. @@ -54,9 +62,9 @@ def get_last_modified_date(filename: str) -> Optional[str]: def set_element_hierarchy( elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET ) -> list[Element]: - """Sets the parent_id for each element in the list of elements - based on the element's category, depth and a ruleset + """Sets `.metadata.parent_id` for each element it applies to. + `parent_id` assignment is based on the element's category, depth and a ruleset. """ stack: list[Element] = [] for element in elements: @@ -97,3 +105,104 @@ def set_element_hierarchy( stack.append(element) return list(elements) + + +# ================================================================================================ +# METADATA POST-PARTITIONING PROCESSING DECORATOR +# ================================================================================================ + + +def apply_metadata( + file_type: FileType | None = None, +) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: + """Post-process element-metadata for this document. + + This decorator adds a post-processing step to a partitioner, primarily to apply metadata that + is common to all partitioners. It assumes the following responsibilities: + + - Hash element-ids. Computes and applies SHA1 hash element.id when `unique_element_ids` + argument is False. + + - Element Hierarchy. Computes and applies `parent_id` metadata based on `category_depth` + etc. added by partitioner. + + - Language metadata. Computes and applies `languages` metadata based on a language detection + model. + + - Apply `filetype` (MIME-type) metadata. There are three cases; first one in this order that + applies is used: + + - `metadata_file_type` argument is present in call, use that. + - `file_type` decorator argument is populated, use that. 
+ - `file_type` decorator argument is omitted or None, don't apply `.metadata.filetype` + (assume the partitioner will do that for itself, like `partition_image()`). + + - Replace `filename` with `metadata_filename` when present. + + - Apply `url` metadata when present. + """ + + def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: + """The decorator function itself. + + This function is returned by the `apply_metadata()` function and is the actual decorator. + Think of `apply_metadata()` as a factory function that configures this decorator, in + particular by setting its `file_type` value. + """ + + @functools.wraps(func) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: + elements = func(*args, **kwargs) + call_args = get_call_args_applying_defaults(func, *args, **kwargs) + + # -- Compute and apply hash-ids if the user does not want UUIDs. Note this changes the + # -- elements themselves, not the metadata. + unique_element_ids: bool = call_args.get("unique_element_ids", False) + if unique_element_ids is False: + elements = assign_and_map_hash_ids(elements) + + # -- `parent_id` - process category-level etc. to assign parent-id -- + elements = set_element_hierarchy(elements) + + # -- `language` - auto-detect language (e.g. 
eng, spa) -- + languages = call_args.get("languages") + detect_language_per_element = call_args.get("detect_language_per_element", False) + elements = list( + apply_lang_metadata( + elements=elements, + languages=languages, + detect_language_per_element=detect_language_per_element, + ) + ) + + # == apply filetype, filename, and url metadata ========================= + metadata_kwargs: dict[str, Any] = {} + + # -- `filetype` (MIME-type) metadata -- + metadata_file_type = call_args.get("metadata_file_type") or file_type + if metadata_file_type is not None: + metadata_kwargs["filetype"] = metadata_file_type.mime_type + + # -- `filename` metadata - override with metadata_filename when it's present -- + filename = call_args.get("metadata_filename") or call_args.get("filename") + if filename: + metadata_kwargs["filename"] = filename + + # -- `url` metadata - record url when present -- + url = call_args.get("url") + if url: + metadata_kwargs["url"] = url + + # -- update element.metadata in single pass -- + for element in elements: + # NOTE(robinson) - Attached files have already run through this logic in their own + # partitioning function + if element.metadata.attached_to_filename: + continue + element.metadata.update(ElementMetadata(**metadata_kwargs)) + + return elements + + return wrapper + + return decorator diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 5b228da656..9e2975dcfa 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -16,8 +16,8 @@ ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date -from unstructured.partition.lang import apply_lang_metadata from unstructured.utils import is_temp_file_path, lazyproperty DETECTION_ORIGIN: str = "csv" diff --git a/unstructured/partition/docx.py 
b/unstructured/partition/docx.py index 53490f2cfd..89889621a8 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -46,8 +46,8 @@ ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date -from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text_type import ( is_bulleted_text, is_email_address, diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 6e3b6ab74b..b6508eddd7 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -47,9 +47,9 @@ from unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE from unstructured.partition.common.common import convert_to_bytes, exactly_one +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.html import partition_html -from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text import partition_text VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"] @@ -101,7 +101,7 @@ def append_address_header_elements(header: AddressHeader, element_type: Type[Ele for addr in header.addresses: elements.append( element_type( - name=addr.display_name or addr.username, + name=addr.display_name or addr.username, # type: ignore text=addr.addr_spec, # type: ignore ) ) diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py index 8983f39b4d..a4e695cf7a 100644 --- a/unstructured/partition/html/partition.py +++ b/unstructured/partition/html/partition.py @@ -14,9 +14,9 @@ from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.filetype import 
add_metadata_with_filetype from unstructured.file_utils.model import FileType +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.html.parser import Flow, html_parser -from unstructured.partition.lang import apply_lang_metadata from unstructured.utils import is_temp_file_path, lazyproperty diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index c5fae5a9c7..50ceaa1187 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -6,7 +6,7 @@ from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata from unstructured.partition.common.common import exactly_one -from unstructured.partition.lang import check_language_args +from unstructured.partition.common.lang import check_language_args from unstructured.partition.pdf import partition_pdf_or_image from unstructured.partition.utils.constants import PartitionStrategy diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 8ce7e5880a..0641c662bb 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -14,9 +14,9 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType from unstructured.logger import logger +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.html import partition_html -from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text import partition_text from unstructured.utils import is_temp_file_path, lazyproperty diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index bb276fc194..1f8300ce37 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -48,12 
+48,12 @@ ocr_data_to_elements, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.common.metadata import get_last_modified_date -from unstructured.partition.lang import ( +from unstructured.partition.common.lang import ( check_language_args, prepare_languages_for_tesseract, tesseract_to_paddle_language, ) +from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.pdf_image.analysis.layout_dump import ( ExtractedLayoutDumper, FinalLayoutDumper, diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index b856adf977..df24daac00 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -37,8 +37,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType from unstructured.partition.common.common import convert_ms_office_table_to_text +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date -from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text_type import ( is_email_address, is_possible_narrative_text, diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index ef50ab6b5b..12df9abd5e 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -30,8 +30,8 @@ from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE from unstructured.nlp.tokenize import sent_tokenize from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date -from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text_type import ( is_bulleted_text, is_email_address, diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index 8509a337d9..72c0984e4e 100644 
--- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -18,8 +18,8 @@ exactly_one, spooled_to_bytes_io_if_needed, ) +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date -from unstructured.partition.lang import apply_lang_metadata DETECTION_ORIGIN: str = "tsv" diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index c6d516ee5e..7f6e93eca5 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -26,8 +26,8 @@ ) from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date -from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text_type import ( is_bulleted_text, is_possible_narrative_text, diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index f9cca71a27..1ed30966b9 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -20,8 +20,8 @@ exactly_one, spooled_to_bytes_io_if_needed, ) +from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date -from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text import element_from_text DETECTION_ORIGIN: str = "xml"