diff --git a/CHANGELOG.md b/CHANGELOG.md index f40e602451..416ef54472 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.3-dev3 +## 0.14.3-dev4 ### Enhancements @@ -13,8 +13,9 @@ * **Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call**. * **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml` to avoid text being dynamically injected into the XML document. - * **Chromadb change from Add to Upsert using element_id to make idempotent** +* **Reduce excessive logging** Change per-page OCR info-level logging to detail-level trace logging +* **Replace try block in `document_to_element_list` for handling HTMLDocument** Use `getattr(element, "type", "")` to get the `type` attribute of an element when it exists. This is a more explicit way to handle the special case for HTML documents and prevents other types of attribute errors from being silenced by the try block ## 0.14.2 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3915e31193..a1c7ac250b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.3-dev3" # pragma: no cover +__version__ = "0.14.3-dev4" # pragma: no cover diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 299ed02d78..24ca177ac3 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -537,6 +537,9 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]: } +# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in +# unstructured.documents.html, which imports this module so we can't import the class for type +# hints. 
Moreover, those two types of documents have different lists of attributes def document_to_element_list( document: "DocumentLayout", sortable: bool = False, @@ -550,7 +553,7 @@ def document_to_element_list( starting_page_number: int = 1, **kwargs: Any, ) -> list[Element]: - """Converts a DocumentLayout object to a list of unstructured elements.""" + """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements.""" elements: list[Element] = [] num_pages = len(document.pages) @@ -588,13 +591,16 @@ def document_to_element_list( element.metadata.last_modified = last_modification_date element.metadata.text_as_html = getattr(layout_element, "text_as_html", None) element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None) - try: - if ( - isinstance(element, Title) and element.metadata.category_depth is None - ) and any(el.type in ["Headline", "Subheadline"] for el in page.elements): - element.metadata.category_depth = 0 - except AttributeError: - logger.info("HTML element instance has no attribute type") + # FIXME: here the elements in a page can be either: + # 1. LayoutElement if the document is DocumentLayout (if the partition is on a + # pdf/image); + # 2. 
Element if the document is HTMLDocument (if the partition is on an html file) + # this discrepancy is due to Element class defined in unstructured and LayoutElement + # class defined in unstructured_inference do not have the same list of attributes + if (isinstance(element, Title) and element.metadata.category_depth is None) and any( + getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements + ): + element.metadata.category_depth = 0 page_elements.append(element) translation_mapping.append((layout_element, element)) diff --git a/unstructured/partition/utils/ocr_models/paddle_ocr.py b/unstructured/partition/utils/ocr_models/paddle_ocr.py index 151c1277fd..def01a99e9 100644 --- a/unstructured/partition/utils/ocr_models/paddle_ocr.py +++ b/unstructured/partition/utils/ocr_models/paddle_ocr.py @@ -6,7 +6,7 @@ from PIL import Image as PILImage from unstructured.documents.elements import ElementType -from unstructured.logger import logger +from unstructured.logger import logger, trace_logger from unstructured.partition.utils.constants import DEFAULT_PADDLE_LANG, Source from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent from unstructured.utils import requires_dependencies @@ -71,7 +71,7 @@ def get_layout_from_image( ) -> list[TextRegion]: """Get the OCR regions from image as a list of text regions with paddle.""" - logger.info("Processing entire page OCR with paddle...") + trace_logger.detail("Processing entire page OCR with paddle...") # TODO(yuming): pass in language parameter once we # have the mapping for paddle lang code diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 915569b472..7f2c874240 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -10,7 +10,7 @@ from PIL import Image as PILImage from unstructured_pytesseract import Output -from 
unstructured.logger import logger +from unstructured.logger import trace_logger from unstructured.partition.utils.config import env_config from unstructured.partition.utils.constants import ( IMAGE_COLOR_DEPTH, @@ -44,7 +44,7 @@ def get_layout_from_image( ) -> List[TextRegion]: """Get the OCR regions from image as a list of text regions with tesseract.""" - logger.info("Processing entire page OCR with tesseract...") + trace_logger.detail("Processing entire page OCR with tesseract...") zoom = 1 ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data( np.array(image),