Skip to content

Commit

Permalink
implement and document a simple function for deducing doctype
Browse files Browse the repository at this point in the history
  • Loading branch information
micmarty-deepsense committed May 21, 2024
1 parent d70da29 commit 6b57229
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 0 deletions.
14 changes: 14 additions & 0 deletions test_unstructured/metrics/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
TableStructureMetricsCalculator,
TextExtractionMetricsCalculator,
filter_metrics,
get_document_type,
get_mean_grouping,
)

Expand Down Expand Up @@ -139,6 +140,19 @@ def test_process_document_returns_the_correct_amount_of_values(
assert len(output_list) == expected_length


@pytest.mark.parametrize(
    ("filename", "expected"),
    [
        ("document.pdf.json", "pdf"),
        ("report.docx.json", "docx"),
        ("file.txt.json", "txt"),
        ("file.with.multiple.dots.pdf.json", "pdf"),
    ],
)
def test_get_document_type_from_filename_returns_correct_document_type(filename, expected):
    """The second-to-last extension of the filename is reported as the document type."""
    document_path = Path(filename)
    actual = get_document_type(document_path)
    assert actual == expected


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation_type_txt():
Expand Down
21 changes: 21 additions & 0 deletions unstructured/metrics/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@
# NOTE(review): presumably the accepted output file formats for evaluated
# documents -- usage is not visible here, confirm against the callers.
OUTPUT_TYPE_OPTIONS = ["json", "txt"]


def get_document_type(path: Path) -> str:
    """Extract the document type from the filename.

    The document type is assumed to be the second-to-last suffix in the filename.
    This is because of two reasons:
    1. Evaluated document names are expected to always have two extensions.
       Partitioned files are supposed to have the ".json" extension and
       ground truth files can have either ".txt" or ".json".
    2. The filename can theoretically have multiple dots in its base name
       (which should not be treated as extensions).

    Args:
        path: Path to the document file, e.g. ``Path("report.docx.json")``.

    Returns:
        The second-to-last extension without its leading dot, e.g. ``"docx"``.
        The case of the extension is preserved as-is.

    Raises:
        IndexError: If the filename has fewer than two extensions
            (e.g. ``"report.json"``), so no document type can be deduced.
    """
    suffixes = path.suffixes
    if len(suffixes) < 2:
        # Indexing [-2] would raise a bare "list index out of range" here.
        # The exception type is kept as IndexError so existing callers behave
        # the same, but the message now points at the offending filename.
        raise IndexError(
            f"Cannot deduce document type from {path.name!r}: expected a filename "
            "with at least two extensions, e.g. 'document.pdf.json'."
        )
    # Every entry of `Path.suffixes` starts with a dot, so it can be sliced off
    # unconditionally.
    return suffixes[-2][1:]


@dataclass
class BaseMetricsCalculator(ABC):
"""Foundation class for specialized metrics calculators.
Expand Down

0 comments on commit 6b57229

Please sign in to comment.