Skip to content

Commit

Permalink
[text analytics] add normalized_text (#17074)
Browse files Browse the repository at this point in the history
  • Loading branch information
iscai-msft authored Mar 3, 2021
1 parent e16a929 commit 4ee7f35
Show file tree
Hide file tree
Showing 10 changed files with 280 additions and 4 deletions.
5 changes: 5 additions & 0 deletions sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
- Renamed properties `aspect` and `opinions` to `target` and `assessments` respectively in class `MinedOpinion`.
- Renamed classes `AspectSentiment` and `OpinionSentiment` to `TargetSentiment` and `AssessmentSentiment` respectively.

**New Features**

- Add property `normalized_text` to `HealthcareEntity`. This property is a normalized version of the `text` property that already
exists on the `HealthcareEntity`.

## 5.1.0b5 (2021-02-10)

**Breaking Changes**
Expand Down
1 change: 1 addition & 0 deletions sdk/textanalytics/azure-ai-textanalytics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ print("Results of Healthcare Entities Analysis:")
for idx, doc in enumerate(docs):
for entity in doc.entities:
print("Entity: {}".format(entity.text))
print("...Normalized Text: {}".format(entity.normalized_text))
print("...Category: {}".format(entity.category))
print("...Subcategory: {}".format(entity.subcategory))
print("...Offset: {}".format(entity.offset))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,9 @@ def __repr__(self):
class HealthcareEntity(DictMixin):
"""HealthcareEntity contains information about a Healthcare entity found in text.
:ivar str text: Entity text as appears in the request.
:ivar str text: Entity text as appears in the document.
:ivar str normalized_text: Optional. Normalized version of the raw `text` we extract
from the document. Not all `text`s have a normalized version.
:ivar str category: Entity category, see the following link for health's named
entity types: https://aka.ms/text-analytics-health-entities
:ivar str subcategory: Entity subcategory.
Expand All @@ -510,6 +512,7 @@ class HealthcareEntity(DictMixin):

def __init__(self, **kwargs):
self.text = kwargs.get("text", None)
self.normalized_text = kwargs.get("normalized_text", None)
self.category = kwargs.get("category", None)
self.subcategory = kwargs.get("subcategory", None)
self.length = kwargs.get("length", None)
Expand All @@ -521,6 +524,7 @@ def __init__(self, **kwargs):
def _from_generated(cls, healthcare_entity):
return cls(
text=healthcare_entity.text,
normalized_text=healthcare_entity.name,
category=healthcare_entity.category,
subcategory=healthcare_entity.subcategory,
length=healthcare_entity.length,
Expand All @@ -535,9 +539,10 @@ def __hash__(self):
return hash(repr(self))

def __repr__(self):
return "HealthcareEntity(text={}, category={}, subcategory={}, length={}, offset={}, confidence_score={}, "\
"data_sources={})".format(
return "HealthcareEntity(text={}, normalized_text={}, category={}, subcategory={}, length={}, offset={}, "\
"confidence_score={}, data_sources={})".format(
self.text,
self.normalized_text,
self.category,
self.subcategory,
self.length,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ async def analyze_healthcare_entities_async(self):
for idx, doc in enumerate(docs):
for entity in doc.entities:
print("Entity: {}".format(entity.text))
print("...Normalized Text: {}".format(entity.normalized_text))
print("...Category: {}".format(entity.category))
print("...Subcategory: {}".format(entity.subcategory))
print("...Offset: {}".format(entity.offset))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def analyze_healthcare_entities(self):
for idx, doc in enumerate(docs):
for entity in doc.entities:
print("Entity: {}".format(entity.text))
print("...Normalized Text: {}".format(entity.normalized_text))
print("...Category: {}".format(entity.category))
print("...Subcategory: {}".format(entity.subcategory))
print("...Offset: {}".format(entity.offset))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "patients must have histologically confirmed
NHL", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '105'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
response:
body:
string: ''
headers:
apim-request-id:
- 1c4b7bf9-4eaf-41c1-8c28-585fd380d751
date:
- Wed, 03 Mar 2021 21:46:23 GMT
operation-location:
- https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '338'
status:
code: 202
message: Accepted
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
response:
body:
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:24Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"notStarted","errors":[]}'
headers:
apim-request-id:
- 57339114-5845-4f08-ab4d-0aa36c843d25
content-type:
- application/json; charset=utf-8
date:
- Wed, 03 Mar 2021 21:46:28 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '146'
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
response:
body:
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:32Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"running","errors":[]}'
headers:
apim-request-id:
- 417f0558-5abd-49fd-8cd7-32f2d03549bd
content-type:
- application/json; charset=utf-8
date:
- Wed, 03 Mar 2021 21:46:33 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '122'
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
response:
body:
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:32Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"running","errors":[]}'
headers:
apim-request-id:
- 54ddb168-5bcc-4610-86b4-1b02d2241bd5
content-type:
- application/json; charset=utf-8
date:
- Wed, 03 Mar 2021 21:46:39 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '87'
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
response:
body:
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:43Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"succeeded","errors":[],"results":{"documents":[{"id":"0","entities":[{"offset":19,"length":14,"text":"histologically","category":"ExaminationName","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0344441"},{"dataSource":"CHV","id":"0000030964"},{"dataSource":"LNC","id":"MTHU010496"},{"dataSource":"MDR","id":"10062005"},{"dataSource":"MTH","id":"U002823"},{"dataSource":"MTHMST","id":"MT140012"},{"dataSource":"NCI","id":"C49131"},{"dataSource":"SNOMEDCT_US","id":"714797009"}]},{"offset":44,"length":3,"text":"NHL","category":"Diagnosis","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0024305"},{"dataSource":"BI","id":"BI00323"},{"dataSource":"CCPSS","id":"0001640"},{"dataSource":"CCS","id":"2.10.2"},{"dataSource":"CCSR_10","id":"NEO058"},{"dataSource":"CHV","id":"0000007621"},{"dataSource":"COSTAR","id":"U000045"},{"dataSource":"CSP","id":"4001-0094"},{"dataSource":"DXP","id":"U002830"},{"dataSource":"HPO","id":"HP:0012539"},{"dataSource":"ICD10","id":"C85.9"},{"dataSource":"ICD10AM","id":"M9672/3"},{"dataSource":"ICD10CM","id":"C85.9"},{"dataSource":"ICPC2ICD10ENG","id":"MTHU053464"},{"dataSource":"ICPC2P","id":"B74002"},{"dataSource":"MDR","id":"10029547"},{"dataSource":"MEDCIN","id":"35839"},{"dataSource":"MEDLINEPLUS","id":"117"},{"dataSource":"MSH","id":"D008228"},{"dataSource":"NCI","id":"C3211"},{"dataSource":"NCI_CELLOSAURUS","id":"C3211"},{"dataSource":"NCI_CPTAC","id":"C3211"},{"dataSource":"NCI_CTEP-SDC","id":"10029593"},{"dataSource":"NCI_CTRP","id":"C3211"},{"dataSource":"NCI_GDC","id":"C3211"},{"dataSource":"NCI_NCI-GLOSS","id":"CDR0000045148"},{"dataSource":"NCI_NICHD","id":"C3211"},{"dataSource":"OMIM","id":"MTHU014311"},{"dataSource":"PDQ","id":"CDR0000038957"},{"dataSource":"QMR","id":"R0121804"},{"dataSource":"RCD","id":"B627."},{"dataSo
urce":"SNM","id":"M-YYX54"},{"dataSource":"SNMI","id":"M-96723"},{"dataSource":"SNOMEDCT_US","id":"1929004"},{"dataSource":"WHO","id":"1544"}]}],"relations":[],"warnings":[]}],"errors":[],"modelVersion":"2021-01-11"}}'
headers:
apim-request-id:
- 356495ad-d24a-4870-ae9a-3bc03cdc951b
content-type:
- application/json; charset=utf-8
date:
- Wed, 03 Mar 2021 21:46:45 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '302'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "patients must have histologically confirmed
NHL", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '105'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
response:
body:
string: ''
headers:
apim-request-id: 5f62849b-975a-4da4-8d9f-359e2b7af6d4
date: Wed, 03 Mar 2021 21:46:45 GMT
operation-location: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '75'
status:
code: 202
message: Accepted
url: https://westus2.api.cognitive.microsoft.com//text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
- request:
body: null
headers:
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
response:
body:
string: '{"jobId":"4be5a0c6-2663-46d8-ba56-ffeefe175b9b","lastUpdateDateTime":"2021-03-03T21:46:48Z","createdDateTime":"2021-03-03T21:46:45Z","expirationDateTime":"2021-03-04T21:46:45Z","status":"succeeded","errors":[],"results":{"documents":[{"id":"0","entities":[{"offset":19,"length":14,"text":"histologically","category":"ExaminationName","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0344441"},{"dataSource":"CHV","id":"0000030964"},{"dataSource":"LNC","id":"MTHU010496"},{"dataSource":"MDR","id":"10062005"},{"dataSource":"MTH","id":"U002823"},{"dataSource":"MTHMST","id":"MT140012"},{"dataSource":"NCI","id":"C49131"},{"dataSource":"SNOMEDCT_US","id":"714797009"}]},{"offset":44,"length":3,"text":"NHL","category":"Diagnosis","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0024305"},{"dataSource":"BI","id":"BI00323"},{"dataSource":"CCPSS","id":"0001640"},{"dataSource":"CCS","id":"2.10.2"},{"dataSource":"CCSR_10","id":"NEO058"},{"dataSource":"CHV","id":"0000007621"},{"dataSource":"COSTAR","id":"U000045"},{"dataSource":"CSP","id":"4001-0094"},{"dataSource":"DXP","id":"U002830"},{"dataSource":"HPO","id":"HP:0012539"},{"dataSource":"ICD10","id":"C85.9"},{"dataSource":"ICD10AM","id":"M9672/3"},{"dataSource":"ICD10CM","id":"C85.9"},{"dataSource":"ICPC2ICD10ENG","id":"MTHU053464"},{"dataSource":"ICPC2P","id":"B74002"},{"dataSource":"MDR","id":"10029547"},{"dataSource":"MEDCIN","id":"35839"},{"dataSource":"MEDLINEPLUS","id":"117"},{"dataSource":"MSH","id":"D008228"},{"dataSource":"NCI","id":"C3211"},{"dataSource":"NCI_CELLOSAURUS","id":"C3211"},{"dataSource":"NCI_CPTAC","id":"C3211"},{"dataSource":"NCI_CTEP-SDC","id":"10029593"},{"dataSource":"NCI_CTRP","id":"C3211"},{"dataSource":"NCI_GDC","id":"C3211"},{"dataSource":"NCI_NCI-GLOSS","id":"CDR0000045148"},{"dataSource":"NCI_NICHD","id":"C3211"},{"dataSource":"OMIM","id":"MTHU014311"},{"dataSource":"PDQ","id":"CDR0000038957"},{"dataSource":"QMR","id":"R0121804"},{"dataSource":"RCD","id":"B627."},{"dataSo
urce":"SNM","id":"M-YYX54"},{"dataSource":"SNMI","id":"M-96723"},{"dataSource":"SNOMEDCT_US","id":"1929004"},{"dataSource":"WHO","id":"1544"}]}],"relations":[],"warnings":[]}],"errors":[],"modelVersion":"2021-01-11"}}'
headers:
apim-request-id: f9b79e8f-3fa1-4623-99b1-bf925c6b3b60
content-type: application/json; charset=utf-8
date: Wed, 03 Mar 2021 21:46:50 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '30'
status:
code: 200
message: OK
url: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,17 @@ def test_relations(self, client):
else:
assert role.name == HealthcareEntityRelationRoleType.ABBREVIATED_TERM
self.assert_healthcare_entities_equal(role.entity, parkinsons_abbreviation_entity)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_normalized_text(self, client):
    """Check that every returned healthcare entity exposes ``normalized_text``."""
    result = list(client.begin_analyze_healthcare_entities(
        documents=["patients must have histologically confirmed NHL"]
    ).result())

    # Currently this only tests that the attribute exists; its value is not checked.
    # Tracked by https://github.com/Azure/azure-sdk-for-python/issues/17072

    entities = result[0].entities
    # Guard against a vacuous pass: all() over an empty iterable is True.
    assert entities
    # Assert the predicate itself for every entity. Filtering on hasattr() and
    # then calling all() on the surviving (always truthy) objects can never fail.
    assert all(hasattr(e, "normalized_text") for e in entities)
Original file line number Diff line number Diff line change
Expand Up @@ -420,4 +420,22 @@ async def test_relations(self, client):
assert role.name == "AbbreviatedTerm"
self.assert_healthcare_entities_equal(role.entity, parkinsons_abbreviation_entity)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_normalized_text(self, client):
    """Check that every returned healthcare entity exposes ``normalized_text``."""
    response = await (await client.begin_analyze_healthcare_entities(
        documents=["patients must have histologically confirmed NHL"]
    )).result()

    # Drain the async iterable of per-document results into a list.
    result = []
    async for r in response:
        result.append(r)

    # Currently this only tests that the attribute exists; its value is not checked.
    # Tracked by https://github.com/Azure/azure-sdk-for-python/issues/17072

    entities = result[0].entities
    # Guard against a vacuous pass: all() over an empty iterable is True.
    assert entities
    # Assert the predicate itself for every entity. Filtering on hasattr() and
    # then calling all() on the surviving (always truthy) objects can never fail.
    assert all(hasattr(e, "normalized_text") for e in entities)


3 changes: 2 additions & 1 deletion sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def data_source():
def healthcare_entity(data_source):
model = _models.HealthcareEntity(
text="Bonjour",
normalized_text="Bonjour",
category="MyCategory",
subcategory="MySubcategory",
length=7,
Expand All @@ -286,7 +287,7 @@ def healthcare_entity(data_source):
data_sources=[data_source[0]],
)
model_repr = (
"HealthcareEntity(text=Bonjour, category=MyCategory, subcategory=MySubcategory, length=7, offset=12, " +
"HealthcareEntity(text=Bonjour, normalized_text=Bonjour, category=MyCategory, subcategory=MySubcategory, length=7, offset=12, " +
"confidence_score=0.95, data_sources=[{}])".format(data_source[1])
)

Expand Down

0 comments on commit 4ee7f35

Please sign in to comment.