Skip to content

Commit

Permalink
chore: reduce excessive logging (#3095)
Browse files Browse the repository at this point in the history
- change some info level logging for per page processing into detail
level logging on trace logger
- replace the try block in `document_to_element_list` to use `getattr`
instead and add comment on the reason why sometimes `type` attribute may
not exist for an element
  • Loading branch information
badGarnet authored May 24, 2024
1 parent 26d403d commit 809c7e5
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 15 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.3-dev3
## 0.14.3-dev4

### Enhancements

Expand All @@ -13,8 +13,9 @@
* **Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call**.
* **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml`
to avoid text being dynamically injected into the XML document.

* **Chromadb change from Add to Upsert using element_id to make idempotent**
* **Reduce excessive logging** Change per-page OCR info-level logging into detail-level logging on the trace logger
* **Replace try block in `document_to_element_list` for handling HTMLDocument** Use `getattr(element, "type", "")` to get the `type` attribute of an element when it exists. This is a more explicit way to handle the special case for HTML documents and prevents other kinds of `AttributeError` from being silenced by the try block

## 0.14.2

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.3-dev3" # pragma: no cover
__version__ = "0.14.3-dev4" # pragma: no cover
22 changes: 14 additions & 8 deletions unstructured/partition/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,9 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
}


# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
# unstructured.documents.html, which imports this module so we can't import the class for type
# hints. Moreover, those two types of documents have different lists of attributes
def document_to_element_list(
document: "DocumentLayout",
sortable: bool = False,
Expand All @@ -550,7 +553,7 @@ def document_to_element_list(
starting_page_number: int = 1,
**kwargs: Any,
) -> list[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements."""
"""Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements."""
elements: list[Element] = []

num_pages = len(document.pages)
Expand Down Expand Up @@ -588,13 +591,16 @@ def document_to_element_list(
element.metadata.last_modified = last_modification_date
element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
try:
if (
isinstance(element, Title) and element.metadata.category_depth is None
) and any(el.type in ["Headline", "Subheadline"] for el in page.elements):
element.metadata.category_depth = 0
except AttributeError:
logger.info("HTML element instance has no attribute type")
# FIXME: here the elements in a page can be either:
# 1. LayoutElement if the document is DocumentLayout (if the partition is on a
# pdf/image);
# 2. Element if the document is HTMLDocument (if the partition is on an html file)
# this discrepancy arises because the Element class defined in unstructured and the
# LayoutElement class defined in unstructured_inference do not have the same attributes
if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
):
element.metadata.category_depth = 0

page_elements.append(element)
translation_mapping.append((layout_element, element))
Expand Down
4 changes: 2 additions & 2 deletions unstructured/partition/utils/ocr_models/paddle_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from PIL import Image as PILImage

from unstructured.documents.elements import ElementType
from unstructured.logger import logger
from unstructured.logger import logger, trace_logger
from unstructured.partition.utils.constants import DEFAULT_PADDLE_LANG, Source
from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
from unstructured.utils import requires_dependencies
Expand Down Expand Up @@ -71,7 +71,7 @@ def get_layout_from_image(
) -> list[TextRegion]:
"""Get the OCR regions from image as a list of text regions with paddle."""

logger.info("Processing entire page OCR with paddle...")
trace_logger.detail("Processing entire page OCR with paddle...")

# TODO(yuming): pass in language parameter once we
# have the mapping for paddle lang code
Expand Down
4 changes: 2 additions & 2 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from PIL import Image as PILImage
from unstructured_pytesseract import Output

from unstructured.logger import logger
from unstructured.logger import trace_logger
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import (
IMAGE_COLOR_DEPTH,
Expand Down Expand Up @@ -44,7 +44,7 @@ def get_layout_from_image(
) -> List[TextRegion]:
"""Get the OCR regions from image as a list of text regions with tesseract."""

logger.info("Processing entire page OCR with tesseract...")
trace_logger.detail("Processing entire page OCR with tesseract...")
zoom = 1
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
np.array(image),
Expand Down

0 comments on commit 809c7e5

Please sign in to comment.