Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[formrecognizer] Add type to FormField #12561

Merged
merged 7 commits into from
Jul 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sdk/formrecognizer/azure-ai-formrecognizer/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

- Values are now capitalized for enums `FormContentType`, `LengthUnit`, `TrainingStatus`, and `CustomFormModelStatus`

**New features**

- `FormField` now has a `type` attribute, which contains the semantic data type of the field value

## 1.0.0b4 (2020-07-07)

**Breaking Changes**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
CustomFormModel,
CustomFormSubmodel,
CustomFormModelField,
FieldValueType
)


Expand Down Expand Up @@ -59,6 +60,7 @@
'CustomFormModel',
'CustomFormSubmodel',
'CustomFormModelField',
'FieldValueType'
]

__VERSION__ = VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,20 @@ def get_field_value(field, value, read_result): # pylint: disable=too-many-retu
return None


class FieldValueType(str, Enum):
    """Semantic data type of the field value.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (for example ``FieldValueType.PHONE_NUMBER == "phoneNumber"``).
    """

    # NOTE(review): these values presumably mirror the type strings returned
    # by the service for a recognized field -- confirm against the generated models.
    STRING = "string"
    DATE = "date"
    TIME = "time"
    PHONE_NUMBER = "phoneNumber"  # value is camelCase, unlike the member name
    NUMBER = "number"
    INTEGER = "integer"
    # NOTE(review): "array" and "object" presumably describe values that are
    # themselves made of other fields -- confirm.
    ARRAY = "array"
    OBJECT = "object"


class LengthUnit(str, Enum):
"""The unit used by the width, height and bounding box properties.
For images, the unit is "pixel". For PDF, the unit is "inch".
Expand Down Expand Up @@ -188,21 +202,24 @@ def __repr__(self):
class FormField(object):
"""Represents a field recognized in an input form.

:ivar type: The type of `value` found on FormField. Possible types include: 'string',
'date', 'time', 'phoneNumber', 'number', 'integer', 'object', or 'array'.
:vartype type: str or ~azure.ai.formrecognizer.FieldValueType
:ivar ~azure.ai.formrecognizer.FieldData label_data:
Contains the text, bounding box, and field elements for the field label.
:ivar ~azure.ai.formrecognizer.FieldData value_data:
Contains the text, bounding box, and field elements for the field value.
:ivar str name: The unique name of the field or label.
:ivar value:
The value for the recognized field. Possible types include: 'string',
'date', 'time', 'phoneNumber', 'number', 'integer', 'object', or 'array'.
The value for the recognized field. Its semantic data type is described by `type`.
:vartype value: str, int, float, :class:`~datetime.date`, :class:`~datetime.time`,
:class:`~azure.ai.formrecognizer.FormField`, or list[:class:`~azure.ai.formrecognizer.FormField`]
:ivar float confidence:
Measures the degree of certainty of the recognition result. Value is between [0.0, 1.0].
"""

def __init__(self, **kwargs):
self.type = kwargs.get("type", None)
self.label_data = kwargs.get("label_data", None)
self.value_data = kwargs.get("value_data", None)
self.name = kwargs.get("name", None)
Expand All @@ -212,6 +229,7 @@ def __init__(self, **kwargs):
@classmethod
def _from_generated(cls, field, value, read_result):
return cls(
type=value.type if value else None,
label_data=FieldData._from_generated(field, read_result),
value_data=FieldData._from_generated(value, read_result),
value=get_field_value(field, value, read_result),
Expand All @@ -222,6 +240,7 @@ def _from_generated(cls, field, value, read_result):
@classmethod
def _from_generated_unlabeled(cls, field, idx, page, read_result):
return cls(
type="string", # unlabeled only returns string
label_data=FieldData._from_generated_unlabeled(field.key, page, read_result),
value_data=FieldData._from_generated_unlabeled(field.value, page, read_result),
value=field.value.text,
Expand All @@ -230,8 +249,8 @@ def _from_generated_unlabeled(cls, field, idx, page, read_result):
)

def __repr__(self):
return "FormField(label_data={}, value_data={}, name={}, value={}, confidence={})".format(
repr(self.label_data), repr(self.value_data), self.name, repr(self.value), self.confidence
return "FormField(type={}, label_data={}, value_data={}, name={}, value={}, confidence={})".format(
self.type, repr(self.label_data), repr(self.value_data), self.name, repr(self.value), self.confidence
)[:1024]


Expand Down
5 changes: 2 additions & 3 deletions sdk/formrecognizer/azure-ai-formrecognizer/tests/test_mgmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,7 @@ def test_get_form_recognizer_client(self, resource_group, location, form_recogni

@GlobalFormRecognizerAccountPreparer()
def test_api_version_form_training_client(self, resource_group, location, form_recognizer_account, form_recognizer_account_key):
transport = RequestsTransport()
with self.assertRaises(ValueError):
ftc = FormTrainingClient(endpoint=form_recognizer_account, credential=AzureKeyCredential(form_recognizer_account_key), transport=transport, api_version="2.1")
ftc = FormTrainingClient(endpoint=form_recognizer_account, credential=AzureKeyCredential(form_recognizer_account_key), api_version="2.1")

ftc = FormTrainingClient(endpoint=form_recognizer_account, credential=AzureKeyCredential(form_recognizer_account_key), transport=transport, api_version="2.0")
ftc = FormTrainingClient(endpoint=form_recognizer_account, credential=AzureKeyCredential(form_recognizer_account_key), api_version="2.0")
Original file line number Diff line number Diff line change
Expand Up @@ -310,10 +310,10 @@ def test_receipt_jpg_include_field_elements(self, client):
for field, value in receipt.__dict__.items():
if field not in ["receipt_items", "page_range", "pages", "fields", "form_type"]:
form_field = getattr(receipt, field)
self.assertTextContentHasValues(form_field.value_data.field_elements, receipt.page_range.first_page_number)
self.assertFieldElementsHasValues(form_field.value_data.field_elements, receipt.page_range.first_page_number)

for field, value in receipt.fields.items():
self.assertTextContentHasValues(value.value_data.field_elements, receipt.page_range.first_page_number)
self.assertFieldElementsHasValues(value.value_data.field_elements, receipt.page_range.first_page_number)

@GlobalFormRecognizerAccountPreparer()
@GlobalClientPreparer()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -321,10 +321,10 @@ async def test_receipt_jpg_include_field_elements(self, client):
for field, value in receipt.__dict__.items():
if field not in ["receipt_items", "page_range", "pages", "fields", "form_type"]:
form_field = getattr(receipt, field)
self.assertTextContentHasValues(form_field.value_data.field_elements, receipt.page_range.first_page_number)
self.assertFieldElementsHasValues(form_field.value_data.field_elements, receipt.page_range.first_page_number)

for field, value in receipt.fields.items():
self.assertTextContentHasValues(value.value_data.field_elements, receipt.page_range.first_page_number)
self.assertFieldElementsHasValues(value.value_data.field_elements, receipt.page_range.first_page_number)

@GlobalFormRecognizerAccountPreparer()
@GlobalClientPreparer()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,10 +201,10 @@ def test_receipt_url_include_field_elements(self, client):
for field, value in receipt.__dict__.items():
if field not in ["receipt_items", "page_range", "pages", "fields", "form_type"]:
field = getattr(receipt, field)
self.assertTextContentHasValues(field.value_data.field_elements, receipt.page_range.first_page_number)
self.assertFieldElementsHasValues(field.value_data.field_elements, receipt.page_range.first_page_number)

for field, value in receipt.fields.items():
self.assertTextContentHasValues(value.value_data.field_elements, receipt.page_range.first_page_number)
self.assertFieldElementsHasValues(value.value_data.field_elements, receipt.page_range.first_page_number)

@GlobalFormRecognizerAccountPreparer()
@GlobalClientPreparer()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,10 +218,10 @@ async def test_receipt_url_include_field_elements(self, client):
for field, value in receipt.__dict__.items():
if field not in ["page_range", "pages", "fields", "form_type"]:
field = getattr(receipt, field)
self.assertTextContentHasValues(field.value_data.field_elements, receipt.page_range.first_page_number)
self.assertFieldElementsHasValues(field.value_data.field_elements, receipt.page_range.first_page_number)

for field, value in receipt.fields.items():
self.assertTextContentHasValues(value.value_data.field_elements, receipt.page_range.first_page_number)
self.assertFieldElementsHasValues(value.value_data.field_elements, receipt.page_range.first_page_number)

@GlobalFormRecognizerAccountPreparer()
@GlobalClientPreparer()
Expand Down
14 changes: 7 additions & 7 deletions sdk/formrecognizer/azure-ai-formrecognizer/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,23 +59,23 @@ def form_table(form_table_cell):
return model, model_repr

@pytest.fixture
def field_text(bounding_box, form_word, form_line):
def field_data(bounding_box, form_word, form_line):
model = _models.FieldData(page_number=1, text="This is text.", bounding_box=bounding_box[0], field_elements=[form_word[0], form_line[0]])
model_repr = "FieldData(page_number=1, text=This is text., bounding_box={}, field_elements=[{}, {}])".format(bounding_box[1], form_word[1], form_line[1])[:1024]
assert repr(model) == model_repr
return model, model_repr

@pytest.fixture
def form_field_two(field_text):
model = _models.FormField(label_data=field_text[0], value_data=field_text[0], name="form_field_two", value="value", confidence=0)
model_repr = "FormField(label_data={}, value_data={}, name=form_field_two, value='value', confidence=0)".format(field_text[1], field_text[1])[:1024]
def form_field_two(field_data):
model = _models.FormField(type="string", label_data=field_data[0], value_data=field_data[0], name="form_field_two", value="value", confidence=0)
model_repr = "FormField(type=string, label_data={}, value_data={}, name=form_field_two, value='value', confidence=0)".format(field_data[1], field_data[1])[:1024]
assert repr(model) == model_repr
return model, model_repr

@pytest.fixture
def form_field_one(field_text, form_field_two):
model = _models.FormField(label_data=field_text[0], value_data=field_text[0], name="form_field_one", value=form_field_two[0], confidence=1.0)
model_repr = "FormField(label_data={}, value_data={}, name=form_field_one, value={}, confidence=1.0)".format(field_text[1], field_text[1], form_field_two[1])[:1024]
def form_field_one(field_data, form_field_two):
model = _models.FormField(type="string", label_data=field_data[0], value_data=field_data[0], name="form_field_one", value=form_field_two[0], confidence=1.0)
model_repr = "FormField(type=string, label_data={}, value_data={}, name=form_field_one, value={}, confidence=1.0)".format(field_data[1], field_data[1], form_field_two[1])[:1024]
assert repr(model) == model_repr
return model, model_repr

Expand Down
32 changes: 19 additions & 13 deletions sdk/formrecognizer/azure-ai-formrecognizer/tests/testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def assertBoundingBoxTransformCorrect(self, box, actual):
self.assertEqual(box[3].x, actual[6])
self.assertEqual(box[3].y, actual[7])

def assertTextContentTransformCorrect(self, field_elements, actual_elements, read_result):
def assertFieldElementsTransFormCorrect(self, field_elements, actual_elements, read_result):
if field_elements is None and actual_elements is None:
return
for receipt, actual in zip(field_elements, actual_elements):
Expand All @@ -222,6 +222,7 @@ def assertLabeledFormFieldDictTransformCorrect(self, form_fields, actual_fields,
self.assertBoundingBoxTransformCorrect(b[label].value_data.bounding_box, a.bounding_box)
self.assertEqual(a.text, b[label].value_data.text)
field_type = a.type
self.assertEqual(field_type, b[label].type)
if field_type == "string":
self.assertEqual(b[label].value, a.value_string)
if field_type == "number":
Expand All @@ -235,7 +236,7 @@ def assertLabeledFormFieldDictTransformCorrect(self, form_fields, actual_fields,
if field_type == "time":
self.assertEqual(b[label].value, a.value_time)
if read_results:
self.assertTextContentTransformCorrect(
self.assertFieldElementsTransFormCorrect(
b[label].value_data.field_elements,
a.elements,
read_results
Expand All @@ -249,15 +250,15 @@ def assertUnlabeledFormFieldDictTransformCorrect(self, form_fields, actual_field
self.assertEqual(a.key.text, form_fields["field-"+str(idx)].label_data.text)
self.assertBoundingBoxTransformCorrect(form_fields["field-"+str(idx)].label_data.bounding_box, a.key.bounding_box)
if read_results:
self.assertTextContentTransformCorrect(
self.assertFieldElementsTransFormCorrect(
form_fields["field-"+str(idx)].label_data.field_elements,
a.key.elements,
read_results
)
self.assertEqual(a.value.text, form_fields["field-" + str(idx)].value_data.text)
self.assertBoundingBoxTransformCorrect(form_fields["field-" + str(idx)].value_data.bounding_box, a.value.bounding_box)
if read_results:
self.assertTextContentTransformCorrect(
self.assertFieldElementsTransFormCorrect(
form_fields["field-"+str(idx)].value_data.field_elements,
a.value.elements,
read_results
Expand All @@ -267,6 +268,7 @@ def assertFormFieldTransformCorrect(self, receipt_field, actual_field, read_resu
if actual_field is None:
return
field_type = actual_field.type
self.assertEqual(field_type, receipt_field.type)
if field_type == "string":
self.assertEqual(receipt_field.value, actual_field.value_string)
if field_type == "number":
Expand All @@ -284,7 +286,7 @@ def assertFormFieldTransformCorrect(self, receipt_field, actual_field, read_resu
self.assertEqual(receipt_field.value_data.text, actual_field.text)
self.assertEqual(receipt_field.confidence, actual_field.confidence if actual_field.confidence is not None else 1.0)
if read_results:
self.assertTextContentTransformCorrect(
self.assertFieldElementsTransFormCorrect(
receipt_field.value_data.field_elements,
actual_field.elements,
read_results
Expand Down Expand Up @@ -314,28 +316,32 @@ def assertTablesTransformCorrect(self, layout, actual_layout, read_results=None,
self.assertEqual(cell.is_header, actual_cell.is_header if actual_cell.is_header is not None else False)
self.assertEqual(cell.is_footer, actual_cell.is_footer if actual_cell.is_footer is not None else False)
self.assertBoundingBoxTransformCorrect(cell.bounding_box, actual_cell.bounding_box)
self.assertTextContentTransformCorrect(cell.field_elements, actual_cell.elements, read_results)
self.assertFieldElementsTransFormCorrect(cell.field_elements, actual_cell.elements, read_results)

def assertReceiptItemsHasValues(self, items, page_number, include_field_elements):
for item in items:
self.assertEqual(item.type, "object")
self.assertBoundingBoxHasPoints(item.value.get("Name").value_data.bounding_box)
self.assertIsNotNone(item.value.get("Name").confidence)
self.assertIsNotNone(item.value.get("Name").value_data.text)
self.assertIsNotNone(item.value.get("Name").type)
self.assertBoundingBoxHasPoints(item.value.get("Quantity").value_data.bounding_box)
self.assertIsNotNone(item.value.get("Quantity").confidence)
self.assertIsNotNone(item.value.get("Quantity").value_data.text)
self.assertIsNotNone(item.value.get("Quantity").type)
self.assertBoundingBoxHasPoints(item.value.get("TotalPrice").value_data.bounding_box)
self.assertIsNotNone(item.value.get("TotalPrice").confidence)
self.assertIsNotNone(item.value.get("TotalPrice").value_data.text)
self.assertIsNotNone(item.value.get("TotalPrice").type)

if include_field_elements:
self.assertTextContentHasValues(item.value.get("Name").value_data.field_elements, page_number)
self.assertTextContentHasValues(item.value.get("Name").value_data.field_elements, page_number)
self.assertTextContentHasValues(item.value.get("Name").value_data.field_elements, page_number)
self.assertFieldElementsHasValues(item.value.get("Name").value_data.field_elements, page_number)
self.assertFieldElementsHasValues(item.value.get("Quantity").value_data.field_elements, page_number)
self.assertFieldElementsHasValues(item.value.get("TotalPrice").value_data.field_elements, page_number)
else:
self.assertIsNone(item.value.get("Name").value_data.field_elements)
self.assertIsNone(item.value.get("Name").value_data.field_elements)
self.assertIsNone(item.value.get("Name").value_data.field_elements)
self.assertIsNone(item.value.get("Quantity").value_data.field_elements)
self.assertIsNone(item.value.get("TotalPrice").value_data.field_elements)

def assertBoundingBoxHasPoints(self, box):
if box is None:
Expand Down Expand Up @@ -376,15 +382,15 @@ def assertFormPagesHasValues(self, pages):
self.assertIsNotNone(cell.row_span)
self.assertIsNotNone(cell.column_span)
self.assertBoundingBoxHasPoints(cell.bounding_box)
self.assertTextContentHasValues(cell.field_elements, page.page_number)
self.assertFieldElementsHasValues(cell.field_elements, page.page_number)

def assertFormWordHasValues(self, word, page_number):
    """Assert that a recognized word is populated and sits on the expected page.

    :param word: Object exposing ``confidence``, ``text``, ``bounding_box``
        and ``page_number`` attributes.
    :param page_number: Page number that ``word.page_number`` must equal.
    :raises AssertionError: On the first check that does not hold.
    """
    # Confidence and text must be present; their actual values are not inspected.
    self.assertIsNotNone(word.confidence)
    self.assertIsNotNone(word.text)
    # Bounding-box validation is delegated to the sibling helper.
    self.assertBoundingBoxHasPoints(word.bounding_box)
    self.assertEqual(word.page_number, page_number)

def assertTextContentHasValues(self, elements, page_number):
def assertFieldElementsHasValues(self, elements, page_number):
if elements is None:
return
for word in elements:
Expand Down