Skip to content

Commit

Permalink
fix: Updates to hOCR Template to follow hOCR Spec (#195)
Browse files Browse the repository at this point in the history
- Added validation in testing with https://github.com/kba/hocr-spec-python
  • Loading branch information
holtskinner authored Nov 7, 2023
1 parent e05cf50 commit 3f52e82
Show file tree
Hide file tree
Showing 10 changed files with 60 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="ocr-system" content="Document AI OCR" />
<meta name="ocr-langs" content="unknown" />
+ <meta name="ocr-scripts" content="unknown" />
<meta name="ocr-number-of-pages" content="{{ pages|length }}" />
- <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
+ <meta name="ocr-capabilities" content="ocrp_lang ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
</head>
<body>
{% for page in pages -%}
Expand All @@ -16,13 +17,13 @@
{% set bidx = loop.index0 -%}
<span class='ocr_carea' id='block_{{ page_number }}_{{ bidx }}' title='{{ docai_block.hocr_bounding_box -}}'>{% for paragraph in docai_block.paragraphs -%}
{% set paridx = loop.index0 -%}
- <span class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
+ <p class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
{% set lidx = loop.index0 -%}
<span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text }}{% for token in line.tokens -%}
{% set tidx = loop.index0 -%}
<span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text }}</span>{% endfor -%}
</span>{% endfor -%}
- </span>{% endfor -%}
+ </p>{% endfor -%}
</span>{% endfor -%}
</div>
{% endfor -%}
Expand Down
6 changes: 5 additions & 1 deletion samples/snippets/test_convert_document_to_hocr_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ def test_convert_document_to_hocr_sample() -> None:
document_path=document_path, document_title=document_title
)

- with open("../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f:
+ with open(
+ "../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml",
+ "r",
+ encoding="utf-8",
+ ) as f:
expected = f.read()

assert actual == expected
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
"immutabledict >= 2.0.0, < 3.0.0dev; python_version<'3.8'",
"Pillow >= 9.5.0, < 11.0.0",
"Jinja2 >= 3.1.0, <= 4.0.0",
+ "hocr-spec >= 0.2.0",
),
python_requires=">=3.7",
classifiers=[
Expand Down
1 change: 1 addition & 0 deletions testing/constraints-3.10.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ google-cloud-documentai
google-cloud-storage
numpy
pikepdf
+ hocr-spec
1 change: 1 addition & 0 deletions testing/constraints-3.11.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ google-cloud-documentai
google-cloud-storage
numpy
pikepdf
+ hocr-spec
1 change: 1 addition & 0 deletions testing/constraints-3.7.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ google-cloud-documentai==2.20.0
google-cloud-storage==2.7.0
numpy==1.19.5
pikepdf==6.2.9
+ hocr-spec==0.2.0
1 change: 1 addition & 0 deletions testing/constraints-3.8.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ google-cloud-documentai
google-cloud-storage
numpy==1.21.6
pikepdf==8.2.3
+ hocr-spec
1 change: 1 addition & 0 deletions testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ google-cloud-documentai
google-cloud-storage
numpy
pikepdf
+ hocr-spec
67 changes: 34 additions & 33 deletions tests/unit/resources/toolbox_invoice_test_0_hocr.xml

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion tests/unit/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+ from io import BytesIO
import json
import os
import shutil
Expand All @@ -32,6 +33,8 @@
from google.cloud import documentai
from google.cloud.documentai_toolbox import document, gcs_utilities

+ from hocr_spec import HocrValidator
+

def get_bytes(file_name):
result = []
Expand Down Expand Up @@ -689,8 +692,15 @@ def test_export_hocr_str():
)

actual_hocr = wrapped_document.export_hocr_str(title="toolbox_invoice_test-0")
+ assert actual_hocr
+ validator = HocrValidator(profile="standard")
+ report = validator.validate(BytesIO(actual_hocr.encode("utf-8")), parse_strict=True)
+
+ assert report.format("bool")

- with open("tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f:
+ with open(
+ "tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r", encoding="utf-8"
+ ) as f:
expected = f.read()

assert actual_hocr == expected
Expand Down

0 comments on commit 3f52e82

Please sign in to comment.