Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[text analytics] Add redacted_text #13449

Merged
merged 6 commits into from
Sep 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- We are now targeting the service's v3.1-preview.1 API as the default. If you would still like to use version v3.0 of the service,
pass in `v3.0` to the kwarg `api_version` when creating your TextAnalyticsClient.
- We have added an API `recognize_pii_entities` which returns entities containing personal information for a batch of documents. Only available for API version v3.1-preview.1 and up.
- In API version v3.1-preview.2 and up, the redacted text of the document is returned on the top-level result object `RecognizePiiEntitiesResult` through property `redacted_text`.
- Added `offset` and `length` properties for `CategorizedEntity`, `SentenceSentiment`, and `LinkedEntityMatch`. These properties are only available for API versions v3.1-preview.1 and up.
- `length` is the number of characters in the text of these models
- `offset` is the offset of the text from the start of the document
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ class RecognizePiiEntitiesResult(DictMixin):
:ivar entities: Recognized PII entities in the document.
:vartype entities:
list[~azure.ai.textanalytics.PiiEntity]
:ivar str redacted_text: The text of the input document with all of the PII
redacted out. Only returned for API versions v3.1-preview.2 and up.
:ivar warnings: Warnings encountered while processing document. Results will still be returned
if there are warnings, but they may not be fully accurate.
:vartype warnings: list[~azure.ai.textanalytics.TextAnalyticsWarning]
Expand All @@ -155,18 +157,28 @@ class RecognizePiiEntitiesResult(DictMixin):
~azure.ai.textanalytics.TextDocumentStatistics
:ivar bool is_error: Boolean check for error item when iterating over list of
results. Always False for an instance of a RecognizePiiEntitiesResult.
.. versionadded:: v3.1-preview.2
The *redacted_text* property.
"""

def __init__(self, **kwargs):
self.id = kwargs.get("id", None)
self.entities = kwargs.get("entities", None)
self.redacted_text = kwargs.get("redacted_text", None)
self.warnings = kwargs.get("warnings", [])
self.statistics = kwargs.get("statistics", None)
self.is_error = False

def __repr__(self):
return "RecognizePiiEntitiesResult(id={}, entities={}, warnings={}, statistics={}, is_error={})" \
.format(self.id, repr(self.entities), repr(self.warnings), repr(self.statistics), self.is_error)[:1024]
return "RecognizePiiEntitiesResult(id={}, entities={}, redacted_text={}, warnings={}, " \
"statistics={}, is_error={})" .format(
self.id,
repr(self.entities),
self.redacted_text,
repr(self.warnings),
repr(self.statistics),
self.is_error
)[:1024]


class DetectLanguageResult(DictMixin):
Expand Down Expand Up @@ -214,9 +226,9 @@ class CategorizedEntity(DictMixin):
:ivar subcategory: Entity subcategory, such as Age/Year/TimeRange etc
:vartype subcategory: str
:ivar int offset: The entity text offset from the start of the document.
Returned in unicode code points. Only returned for api versions v3.1-preview.1 and up.
Returned in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar int length: The length of the entity text. Returned
in unicode code points. Only returned for api versions v3.1-preview.1 and up.
in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar confidence_score: Confidence score between 0 and 1 of the extracted
entity.
:vartype confidence_score: float
Expand Down Expand Up @@ -669,9 +681,9 @@ class LinkedEntityMatch(DictMixin):
:vartype confidence_score: float
:ivar text: Entity text as appears in the request.
:ivar int offset: The linked entity match text offset from the start of the document.
Returned in unicode code points. Only returned for api versions v3.1-preview.1 and up.
Returned in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar int length: The length of the linked entity match text. Returned
in unicode code points. Only returned for api versions v3.1-preview.1 and up.
in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:vartype text: str
"""

Expand Down Expand Up @@ -781,9 +793,9 @@ class SentenceSentiment(DictMixin):
:vartype confidence_scores:
~azure.ai.textanalytics.SentimentConfidenceScores
:ivar int offset: The sentence offset from the start of the document. Returned
in unicode code points. Only returned for api versions v3.1-preview.1 and up.
in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar int length: The length of the sentence. Returned
in unicode code points. Only returned for api versions v3.1-preview.1 and up.
in unicode code points. Only returned for API versions v3.1-preview.1 and up.
:ivar mined_opinions: The list of opinions mined from this sentence.
For example in "The food is good, but the service is bad", we would
mine these two opinions "food is good", "service is bad". Only returned
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ def pii_entities_result(entity, results): # pylint: disable=unused-argument
return RecognizePiiEntitiesResult(
id=entity.id,
entities=[PiiEntity._from_generated(e) for e in entity.entities], # pylint: disable=protected-access
redacted_text=entity.redacted_text if hasattr(entity, "redacted_text") else None,
warnings=[TextAnalyticsWarning._from_generated(w) for w in entity.warnings], # pylint: disable=protected-access
statistics=TextDocumentStatistics._from_generated(entity.statistics), # pylint: disable=protected-access
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "My SSN is 859-98-0987.", "language":
"en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '80'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://cognitiveusw2dev.azure-api.net/text/analytics/v3.1-preview.2/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"redactedText":"My SSN is ***********.","id":"0","entities":[{"text":"859-98-0987","category":"U.S.
Social Security Number (SSN)","offset":10,"length":11,"confidenceScore":0.65}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id:
- c5ba8c84-0e46-471a-b4c8-f02c411c20ec
content-type:
- application/json; charset=utf-8
csp-billing-usage:
- CognitiveServices.TextAnalytics.BatchScoring=1
date:
- Mon, 31 Aug 2020 20:15:43 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '78'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "My SSN is 859-98-0987.", "language":
"en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '80'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"859-98-0987","category":"U.S.
Social Security Number (SSN)","offset":10,"length":11,"confidenceScore":0.65}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id:
- 4ae026d1-15d1-4d77-8913-46922e72d7cb
content-type:
- application/json; charset=utf-8
csp-billing-usage:
- CognitiveServices.TextAnalytics.BatchScoring=1
date:
- Mon, 31 Aug 2020 19:58:17 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '68'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "My SSN is 859-98-0987.", "language":
"en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '80'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://cognitiveusw2dev.azure-api.net/text/analytics/v3.1-preview.2/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"redactedText":"My SSN is ***********.","id":"0","entities":[{"text":"859-98-0987","category":"U.S.
Social Security Number (SSN)","offset":10,"length":11,"confidenceScore":0.65}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id: dc638432-dc71-4f52-aadb-829c2dfd1935
content-type: application/json; charset=utf-8
csp-billing-usage: CognitiveServices.TextAnalytics.BatchScoring=1
date: Mon, 31 Aug 2020 20:15:43 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '80'
status:
code: 200
message: OK
url: https://cognitiveusw2dev.azure-api.net//text/analytics/v3.1-preview.2/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "My SSN is 859-98-0987.", "language":
"en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '80'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"859-98-0987","category":"U.S.
Social Security Number (SSN)","offset":10,"length":11,"confidenceScore":0.65}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id: eeda4dd4-74dd-4e54-88cb-5a0352f065cf
content-type: application/json; charset=utf-8
csp-billing-usage: CognitiveServices.TextAnalytics.BatchScoring=1
date: Mon, 31 Aug 2020 19:58:17 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '106'
status:
code: 200
message: OK
url: https://westus2.api.cognitive.microsoft.com//text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&stringIndexType=UnicodeCodePoint
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# ------------------------------------

import os
import pytest
import platform
import functools
Expand Down Expand Up @@ -576,6 +576,24 @@ def test_recognize_pii_entities_v3(self, client):

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

# currently only have this as playback since the dev endpoint is unreliable
@pytest.mark.playback_test_only
@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer(client_kwargs={
"api_version": TextAnalyticsApiVersion.V3_1_PREVIEW_2,
"text_analytics_account_key": os.environ.get('AZURE_TEXT_ANALYTICS_KEY'),
"text_analytics_account": "https://cognitiveusw2dev.azure-api.net/"
mssfang marked this conversation as resolved.
Show resolved Hide resolved
})
def test_redacted_text(self, client):
result = client.recognize_pii_entities(["My SSN is 859-98-0987."])
self.assertEqual("My SSN is ***********.", result[0].redacted_text)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_redacted_text_v3_1_preview_1(self, client):
result = client.recognize_pii_entities(["My SSN is 859-98-0987."])
self.assertIsNone(result[0].redacted_text)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_phi_domain_filter(self, client):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# ------------------------------------

import os
import pytest
import platform
import functools
Expand Down Expand Up @@ -574,6 +574,24 @@ async def test_recognize_pii_entities_v3(self, client):

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

# currently only have this as playback since the dev endpoint is unreliable
@pytest.mark.playback_test_only
@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer(client_kwargs={
"api_version": TextAnalyticsApiVersion.V3_1_PREVIEW_2,
"text_analytics_account_key": os.environ.get('AZURE_TEXT_ANALYTICS_KEY'),
"text_analytics_account": "https://cognitiveusw2dev.azure-api.net/"
})
async def test_redacted_text(self, client):
result = await client.recognize_pii_entities(["My SSN is 859-98-0987."])
self.assertEqual("My SSN is ***********.", result[0].redacted_text)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_redacted_text_v3_1_preview_1(self, client):
result = await client.recognize_pii_entities(["My SSN is 859-98-0987."])
self.assertIsNone(result[0].redacted_text)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_phi_domain_filter(self, client):
Expand Down
4 changes: 3 additions & 1 deletion sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,11 +290,13 @@ def test_recognize_pii_entities_result(self, pii_entity, text_analytics_warning,
model = _models.RecognizePiiEntitiesResult(
id="1",
entities=[pii_entity[0]],
redacted_text="***********",
warnings=[text_analytics_warning[0]],
statistics=text_document_statistics[0],
is_error=False
)
model_repr = "RecognizePiiEntitiesResult(id=1, entities=[{}], warnings=[{}], statistics={}, is_error=False)".format(
model_repr = "RecognizePiiEntitiesResult(id=1, entities=[{}], redacted_text=***********, warnings=[{}], " \
"statistics={}, is_error=False)".format(
pii_entity[1], text_analytics_warning[1], text_document_statistics[1]
)

Expand Down