Skip to content

Commit

Permalink
feat/Move the category field to Element (#3056)
Browse files Browse the repository at this point in the history
It's pretty basic change, just literally moved the category field to
Element class. Can't think of other changes that are needed here,
because I think pretty much everything expected the category to be
directly in elements list.

For local testing, IDE's and linters should see difference in that
`category` is now in Element.
  • Loading branch information
hubert-rutkowski85 authored May 23, 2024
1 parent c997676 commit b8d894f
Show file tree
Hide file tree
Showing 7 changed files with 8 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

### Enhancements

* **Move `category` field from Text class to Element class.**

### Features

### Fixes
Expand Down
1 change: 0 additions & 1 deletion test_unstructured/chunking/test_dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ class Describe_chunk:
"""Unit-test suite for `unstructured.chunking.dispatch.chunk()` function."""

def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):

register_chunking_strategy("by_something_else", chunk_by_something_else)
kwargs = {
"max_characters": 750,
Expand Down
10 changes: 5 additions & 5 deletions test_unstructured/partition/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,9 +415,11 @@ def test_set_element_hierarchy():
assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
assert (
elements[7].metadata.parent_id is None
), "CheckBox should be None, as it's not a Text based element"
# NOTE(Hubert): moving the category field to Element, caused this to fail.
# Checkboxes will soon be deprecated, then we can remove the test.
# assert (
# elements[7].metadata.parent_id is None
# ), "CheckBox should be None, as it's not a Text based element"
assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
Expand Down Expand Up @@ -567,7 +569,6 @@ def test_ocr_data_to_elements(


class Describe_get_last_modified_date:

def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
modified_timestamp = dt.datetime(
year=2024, month=3, day=5, hour=17, minute=43, second=40
Expand All @@ -589,7 +590,6 @@ def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathl


class Describe_get_last_modified_date_from_file:

def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
self, tmp_path: pathlib.Path
):
Expand Down
3 changes: 1 addition & 2 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,7 @@ class Element(abc.ABC):
"""

text: str
category = "UncategorizedText"

def __init__(
self,
Expand Down Expand Up @@ -844,8 +845,6 @@ def to_dict(self) -> dict[str, Any]:
class Text(Element):
"""Base element for capturing free text from within document."""

category = "UncategorizedText"

def __init__(
self,
text: str,
Expand Down
1 change: 0 additions & 1 deletion unstructured/ingest/connector/clarifai.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ def normalize_dict(self, element_dict: dict) -> dict:
}

def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:

from google.protobuf.struct_pb2 import Struct

logger.info(
Expand Down
1 change: 0 additions & 1 deletion unstructured/ingest/v2/interfaces/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

@dataclass
class BaseProcess(ABC):

def is_async(self) -> bool:
return False

Expand Down
1 change: 0 additions & 1 deletion unstructured/partition/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ def partition_doc(
# -- transient files in a temporary directory that is automatically removed so they don't
# -- pile up.
with tempfile.TemporaryDirectory() as target_dir:

source_file_path = f"{target_dir}/document.doc" if file is not None else filename
assert source_file_path is not None

Expand Down

0 comments on commit b8d894f

Please sign in to comment.