diff --git a/.vscode/cspell.json b/.vscode/cspell.json index 44f5d8200e097..adf1b396b2412 100644 --- a/.vscode/cspell.json +++ b/.vscode/cspell.json @@ -1822,7 +1822,8 @@ "deid", "deidentification", "healthdataaiservices", - "deidentify" + "deidentify", + "deidentified" ] } ], diff --git a/sdk/healthdataaiservices/azure-health-deidentification/samples/README.md b/sdk/healthdataaiservices/azure-health-deidentification/samples/README.md new file mode 100644 index 0000000000000..22b116e38677b --- /dev/null +++ b/sdk/healthdataaiservices/azure-health-deidentification/samples/README.md @@ -0,0 +1,55 @@ +# Azure Health Deidentification client library for Python +Azure Health Deidentification is Microsoft's solution to anonymize unstructured health text. + +[Source code](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/healthdataaiservices/azure-health-deidentification/azure/health/deidentification) +| [Package (PyPI)](https://pypi.org/project/azure-health-deidentification/) + +| [Samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/healthdataaiservices/azure-health-deidentification/samples) + + +## Getting started + +### Prerequisites +* Python 3.8 or later is required to use this package. For more details, please read our page on [Azure SDK for Python version support policy](https://github.com/Azure/azure-sdk-for-python/wiki/Azure-SDKs-Python-version-support-policy). +* You must have an [Azure subscription](https://azure.microsoft.com/free/) and an +**Azure Deidentification Service** to use this package. + +### Install the package +Install the Azure Health Deidentification client library for Python with [pip](https://pypi.org/project/pip/): + +```bash +pip install azure-health-deidentification +``` + +### Create a Deidentification Service +Create a Deidentification Service resource in the [Azure Portal](https://portal.azure.com/). The job-based samples also need a storage account; to create one, follow the +[storage account quickstart](https://docs.microsoft.com/azure/storage/common/storage-quickstart-create-account?tabs=azure-portal). 
+ +### Create the client +To create a Deidentification client, you must obtain the **Service URL** of your Azure Deidentification Service. The snippet below reads it from the `AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT` environment variable: + +```python + endpoint = os.environ["AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT"] + endpoint = endpoint.replace("https://", "") + print(endpoint) + # example: fuf4h4bxg5b0d0dr.api.cac001.deid.azure.com + + credential = DefaultAzureCredential() + + client = DeidentificationClient(endpoint, DefaultAzureCredential()) +``` + +## Key concepts +Operation Modes: +- Tag: Will return a structure of offset and length with the PHI category of the related text spans. +- Redact: Will return output text with placeholder stubbed text, e.g. `[name]` +- Surrogate: Will return output text with synthetic replacements. + - Input: `My name is John Smith` + - Output: `My name is Tom Jones` + +## Contributing +This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com. + +When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
\ No newline at end of file diff --git a/sdk/healthdataaiservices/azure-health-deidentification/samples/async_samples/sample_realtime_deidentification_async.py b/sdk/healthdataaiservices/azure-health-deidentification/samples/async_samples/sample_realtime_deidentification_async.py new file mode 100644 index 0000000000000..78af7163e6744 --- /dev/null +++ b/sdk/healthdataaiservices/azure-health-deidentification/samples/async_samples/sample_realtime_deidentification_async.py @@ -0,0 +1,61 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +FILE: sample_realtime_deidentification_async.py + +DESCRIPTION: + This sample demonstrates the most simple deidentification scenario. It takes in a string of text and will return + the deidentified text. + +USAGE: + python sample_realtime_deidentification_async.py + + Set the environment variables with your own values before running the sample: + 1) AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT - the endpoint to your Deidentification Service resource. 
+"""
+import asyncio
+
+
+async def sample_realtime_deidentification_async():
+    """Deidentify one hard-coded sentence with the async client and print the result."""
+    # [START realtime_deidentification]
+    import os
+    from azure.identity.aio import DefaultAzureCredential
+    from azure.health.deidentification.aio import DeidentificationClient
+    from azure.health.deidentification.models import (
+        DeidentificationResult,
+        DeidentificationContent,
+        OperationType,
+        DocumentDataType,
+    )
+
+    # The sample passes the client a bare host name, so the URL scheme is stripped.
+    endpoint = os.environ["AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT"].replace("https://", "")
+    print(endpoint)
+
+    credential = DefaultAzureCredential()
+    client = DeidentificationClient(endpoint, credential)
+
+    body = DeidentificationContent(
+        input_text="Hello, my name is John Smith.",
+        operation=OperationType.SURROGATE,
+        data_type=DocumentDataType.PLAINTEXT,
+    )
+
+    # Only the aio client returns awaitables; `async with` closes client and credential afterwards.
+    async with client, credential:
+        result: DeidentificationResult = await client.deidentify(body)
+        print(f'Original Text: "{body.input_text}"')
+        print(f'Deidentified Text: "{result.output_text}"')
+    # [END realtime_deidentification]
+
+
+async def main():
+    await sample_realtime_deidentification_async()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_create_and_wait_job.py b/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_create_and_wait_job.py
new file mode 100644
index 0000000000000..9ae0f9f8b63de
--- /dev/null
+++ b/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_create_and_wait_job.py
@@ -0,0 +1,83 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+
+"""
+FILE: sample_create_and_wait_job.py
+
+DESCRIPTION:
+    This sample demonstrates the most simple job-based deidentification scenario.
+    It takes a blob uri as input and an input prefix. It will create a job and wait for the job to complete.
+
+USAGE:
+    python sample_create_and_wait_job.py
+
+    Set the environment variables with your own values before running the sample:
+    1) AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT - the endpoint to your Deidentification Service resource.
+    2) AZURE_STORAGE_ACCOUNT_LOCATION - the location of the storage account where the input and output files are stored.
+       This can be either a URL (which is configured with Managed Identity) or a SasURI.
+    3) INPUT_PREFIX - the prefix of the input files in the storage account.
+"""
+
+
+import uuid
+
+
+def sample_create_and_wait_job():
+    """Create a deidentification job over blob storage, wait for it, and print a summary."""
+    # [START sample_create_and_wait_job]
+    import os
+    from azure.identity import DefaultAzureCredential
+    from azure.health.deidentification import DeidentificationClient
+    from azure.health.deidentification.models import (
+        DeidentificationJob,
+        SourceStorageLocation,
+        TargetStorageLocation,
+        OperationType,
+        DocumentDataType,
+    )
+    from azure.core.polling import LROPoller
+
+    # The sample passes the client a bare host name, so the URL scheme is stripped.
+    endpoint = os.environ["AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT"].replace("https://", "")
+    print(endpoint)
+
+    storage_location = os.environ["AZURE_STORAGE_ACCOUNT_LOCATION"]
+    input_prefix = os.environ["INPUT_PREFIX"]
+    output_prefix = "_output"
+
+    credential = DefaultAzureCredential()
+    # Certificate verification is switched off only when the endpoint mentions localhost.
+    client = DeidentificationClient(
+        endpoint,
+        credential,
+        connection_verify="localhost" not in endpoint,
+    )
+
+    jobname = f"sample-job-{uuid.uuid4().hex[:8]}"
+
+    job = DeidentificationJob(
+        source_location=SourceStorageLocation(
+            location=storage_location,
+            prefix=input_prefix,
+        ),
+        target_location=TargetStorageLocation(
+            location=storage_location, prefix=output_prefix
+        ),
+        operation=OperationType.SURROGATE,
+        data_type=DocumentDataType.PLAINTEXT,
+    )
+
+    lro: LROPoller = client.begin_create_job(jobname, job)
+    lro.wait(timeout=60)
+
+    finished_job: DeidentificationJob = lro.result()
+    print(f"Job Name: {finished_job.name}")
+    print(f"Job Status: {finished_job.status}")
+    print(f"File Count: {finished_job.summary.total}")
+    # [END sample_create_and_wait_job]
+
+
+if __name__ == "__main__":
+    sample_create_and_wait_job()
diff --git a/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_list_job_files.py b/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_list_job_files.py
new file mode 100644
index 0000000000000..5860925a8650a
--- /dev/null
+++ b/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_list_job_files.py
@@ -0,0 +1,93 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+
+"""
+FILE: sample_list_job_files.py
+
+DESCRIPTION:
+    This sample demonstrates how to create a job, wait for it to finish, and then list the files associated with the job.
+
+USAGE:
+    python sample_list_job_files.py
+
+    Set the environment variables with your own values before running the sample:
+    1) AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT - the endpoint to your Deidentification Service resource.
+    2) AZURE_STORAGE_ACCOUNT_LOCATION - the location of the storage account where the input and output files are stored.
+       This can be either a URL (which is configured with Managed Identity) or a SasURI.
+    3) INPUT_PREFIX - the prefix of the input files in the storage account.
+"""
+
+
+import uuid
+
+
+def sample_list_job_files():
+    """Create a deidentification job, wait for it, and print its processed files.
+
+    At most ten input paths are printed.
+    """
+    # [START sample_list_job_files]
+    import os
+    from itertools import islice
+    from azure.identity import DefaultAzureCredential
+    from azure.health.deidentification import DeidentificationClient
+    from azure.health.deidentification.models import (
+        DeidentificationJob,
+        SourceStorageLocation,
+        TargetStorageLocation,
+        OperationType,
+        DocumentDataType,
+    )
+    from azure.core.polling import LROPoller
+
+    # The sample passes the client a bare host name, so the URL scheme is stripped.
+    endpoint = os.environ["AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT"].replace("https://", "")
+    print(endpoint)
+
+    storage_location = os.environ["AZURE_STORAGE_ACCOUNT_LOCATION"]
+    input_prefix = os.environ["INPUT_PREFIX"]
+    output_prefix = "_output"
+
+    credential = DefaultAzureCredential()
+    # Certificate verification is switched off only when the endpoint mentions localhost.
+    client = DeidentificationClient(
+        endpoint,
+        credential,
+        connection_verify="localhost" not in endpoint,
+    )
+
+    jobname = f"sample-job-{uuid.uuid4().hex[:8]}"
+
+    job = DeidentificationJob(
+        source_location=SourceStorageLocation(
+            location=storage_location,
+            prefix=input_prefix,
+        ),
+        target_location=TargetStorageLocation(
+            location=storage_location, prefix=output_prefix
+        ),
+        operation=OperationType.SURROGATE,
+        data_type=DocumentDataType.PLAINTEXT,
+    )
+
+    print(f"Creating job with name: {jobname}")
+    poller: LROPoller = client.begin_create_job(jobname, job)
+    poller.wait(timeout=60)
+
+    finished_job = poller.result()
+    print(f"Job Status: {finished_job.status}")
+
+    files = client.list_job_files(finished_job.name)
+
+    print("Completed files (Max 10):")
+    # islice stops after ten entries, so the rest of the listing is never requested.
+    for f in islice(files, 10):
+        print(f"\t - {f.input.path}")
+
+    # [END sample_list_job_files]
+
+
+if __name__ == "__main__":
+    sample_list_job_files()
diff --git a/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_list_jobs.py b/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_list_jobs.py
new file mode 100644
index 0000000000000..8dae370d307e8
--- 
/dev/null
+++ b/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_list_jobs.py
@@ -0,0 +1,90 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+
+"""
+FILE: sample_list_jobs.py
+
+DESCRIPTION:
+    This sample demonstrates how to list the latest 5 jobs in the Deidentification Service resource.
+    It will create a job and then list it using the list_jobs method.
+
+USAGE:
+    python sample_list_jobs.py
+
+    Set the environment variables with your own values before running the sample:
+    1) AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT - the endpoint to your Deidentification Service resource.
+    2) AZURE_STORAGE_ACCOUNT_LOCATION - the location of the storage account where the input and output files are stored.
+       This can be either a URL (which is configured with Managed Identity) or a SasURI.
+    3) INPUT_PREFIX - the prefix of the input files in the storage account.
+"""
+
+
+import uuid
+
+
+def sample_list_jobs():
+    """Start a deidentification job, then print the names of listed jobs.
+
+    At most five job names are printed.
+    """
+    # [START sample_list_jobs]
+    import os
+    from itertools import islice
+    from azure.identity import DefaultAzureCredential
+    from azure.health.deidentification import DeidentificationClient
+    from azure.health.deidentification.models import (
+        DeidentificationJob,
+        SourceStorageLocation,
+        TargetStorageLocation,
+        OperationType,
+        DocumentDataType,
+    )
+
+    # The sample passes the client a bare host name, so the URL scheme is stripped.
+    endpoint = os.environ["AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT"].replace("https://", "")
+    print(endpoint)
+
+    storage_location = os.environ["AZURE_STORAGE_ACCOUNT_LOCATION"]
+    input_prefix = os.environ["INPUT_PREFIX"]
+    output_prefix = "_output"
+
+    credential = DefaultAzureCredential()
+    # Certificate verification is switched off only when the endpoint mentions localhost.
+    client = DeidentificationClient(
+        endpoint,
+        credential,
+        connection_verify="localhost" not in endpoint,
+    )
+
+    jobname = f"sample-job-{uuid.uuid4().hex[:8]}"
+
+    job = DeidentificationJob(
+        source_location=SourceStorageLocation(
+            location=storage_location,
+            prefix=input_prefix,
+        ),
+        target_location=TargetStorageLocation(
+            location=storage_location, prefix=output_prefix
+        ),
+        operation=OperationType.SURROGATE,
+        data_type=DocumentDataType.PLAINTEXT,
+    )
+
+    print(f"Creating job with name: {jobname}")
+    # The returned poller is discarded, so the listing below runs without waiting for the job.
+    client.begin_create_job(jobname, job)
+
+    jobs = client.list_jobs()
+
+    print("Listing latest 5 jobs:")
+    # islice stops after five entries, so the rest of the listing is never requested.
+    for j in islice(jobs, 5):
+        print(f"Job Name: {j.name}")
+
+    # [END sample_list_jobs]
+
+
+if __name__ == "__main__":
+    sample_list_jobs()
diff --git a/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_realtime_deidentification.py b/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_realtime_deidentification.py
new file mode 100644
index 0000000000000..65501d8a7d2f5
--- /dev/null
+++ b/sdk/healthdataaiservices/azure-health-deidentification/samples/sample_realtime_deidentification.py
@@ -0,0 +1,55 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+
+"""
+FILE: sample_realtime_deidentification.py
+
+DESCRIPTION:
+    This sample demonstrates the most simple deidentification scenario. It takes in a string of text and will return
+    the deidentified text.
+
+USAGE:
+    python sample_realtime_deidentification.py
+
+    Set the environment variables with your own values before running the sample:
+    1) AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT - the endpoint to your Deidentification Service resource.
+"""
+
+
+def sample_realtime_deidentification():
+    """Deidentify one hard-coded sentence with the synchronous client and print the result."""
+    # [START realtime_deidentification]
+    import os
+    from azure.identity import DefaultAzureCredential
+    from azure.health.deidentification import DeidentificationClient
+    from azure.health.deidentification.models import (
+        DeidentificationResult,
+        DeidentificationContent,
+        OperationType,
+        DocumentDataType,
+    )
+
+    # The sample passes the client a bare host name, so the URL scheme is stripped.
+    endpoint = os.environ["AZURE_HEALTH_DEIDENTIFICATION_ENDPOINT"].replace("https://", "")
+    print(endpoint)
+
+    credential = DefaultAzureCredential()
+
+    client = DeidentificationClient(endpoint, credential)
+
+    body = DeidentificationContent(
+        input_text="Hello, my name is John Smith.",
+        operation=OperationType.SURROGATE,
+        data_type=DocumentDataType.PLAINTEXT,
+    )
+
+    result: DeidentificationResult = client.deidentify(body)
+    print(f'Original Text: "{body.input_text}"')
+    print(f'Deidentified Text: "{result.output_text}"')
+    # [END realtime_deidentification]
+
+
+if __name__ == "__main__":
+    sample_realtime_deidentification()
diff --git a/sdk/healthdataaiservices/azure-health-deidentification/tests/test_create_list.py b/sdk/healthdataaiservices/azure-health-deidentification/tests/test_create_list.py
index f4473e657a4e3..146db09f29711 100644
--- a/sdk/healthdataaiservices/azure-health-deidentification/tests/test_create_list.py
+++ b/sdk/healthdataaiservices/azure-health-deidentification/tests/test_create_list.py
@@ -37,7 +37,7 @@ def test_create_list(self, **kwargs):
         job = None
         jobsToLookThrough = 10
         for j in jobs:
-            jobsToLookThrough += 1
+            jobsToLookThrough -= 1
             if j.name == jobname:
                 job = j
                 break
diff --git a/sdk/healthdataaiservices/azure-health-deidentification/tests/test_hello_world.py b/sdk/healthdataaiservices/azure-health-deidentification/tests/test_hello_world.py
index 391da2ba790ef..32e700227510a 100644
--- a/sdk/healthdataaiservices/azure-health-deidentification/tests/test_hello_world.py
+++ 
b/sdk/healthdataaiservices/azure-health-deidentification/tests/test_hello_world.py @@ -2,6 +2,7 @@ from devtools_testutils import ( recorded_by_proxy, ) +import pytest from azure.health.deidentification.models import * @@ -13,7 +14,6 @@ def test_hello_world(self, healthdataaiservices_deid_service_endpoint): client = self.make_client(healthdataaiservices_deid_service_endpoint) assert client is not None - # TODO: NEED TO OVERRIDE StringIndexType content = DeidentificationContent( input_text="Hello, my name is John Smith.", operation=OperationType.SURROGATE,