From 9b1214bec90d0fd84a53636ad00eeb1e8aa66276 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds Date: Mon, 11 Sep 2023 09:40:02 +0000 Subject: [PATCH 01/16] WIP Instance anonymization --- .../data_anonymizer/deanonymizer_mapping.py | 14 +++++++++++++- .../data_anonymizer/presidio.py | 7 ++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index 2ee03eb208040..a26790e7d0028 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -17,5 +17,17 @@ def data(self) -> MappingDataType: return {k: dict(v) for k, v in self.mapping.items()} def update(self, new_mapping: MappingDataType) -> None: + """Update the deanonymizer mapping with new values + Duplicate values will not be added + """ + new_values_seen = set() + for entity_type, values in new_mapping.items(): - self.mapping[entity_type].update(values) + for k, v in values.items(): + # Make sure it is not a duplicate value + if ( + v not in self.mapping[entity_type].values() + and v not in new_values_seen + ): + self.mapping[entity_type][k] = v + new_values_seen.update({v}) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index b2be1dc5a1c0d..526f685624bea 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -282,7 +282,12 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: text, filtered_analyzer_results, anonymizer_results ) - return anonymizer_results.text + anonymizer_mapping = { + key: {v: k for k, v in inner_dict.items()} + for key, inner_dict in self.deanonymizer_mapping.items() + } + + return default_matching_strategy(text, anonymizer_mapping) def _deanonymize( self, From a95e424db15d372c9b242ed546e4c05c0dea5ab6 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds Date: Mon, 11 Sep 2023 14:05:56 +0000 Subject: [PATCH 02/16] Refactor mapping creation, adding it to anonymizer --- .../data_anonymizer/deanonymizer_mapping.py | 63 +++++++++++++- .../data_anonymizer/presidio.py | 87 ++++++------------- 2 files changed, 87 insertions(+), 63 deletions(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index a26790e7d0028..c104a14dcf23b 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -1,6 +1,9 @@ from collections import defaultdict from dataclasses import dataclass, field -from typing import Dict +from typing import Dict, List + +from presidio_analyzer import RecognizerResult +from presidio_anonymizer.entities import EngineResult MappingDataType = Dict[str, Dict[str, str]] @@ -31,3 +34,61 @@ def update(self, new_mapping: MappingDataType) -> None: ): self.mapping[entity_type][k] = v new_values_seen.update({v}) + + +def create_anonymizer_mapping( + original_text: str, + analyzer_results: List[RecognizerResult], + anonymizer_results: EngineResult, + reversed: bool = False, +) -> MappingDataType: + """Creates or updates the mapping used to anonymize and/or deanonymize text. + + This method exploits the results returned by the + analysis and anonymization processes. + + If reversed is True, it constructs a mapping from each original + entity to its anonymized value. + + If reversed is False, it constructs a mapping from each + anonymized entity back to its original text value. + + Example of mapping: + { + "PERSON": { + "": "", + "John Doe": "Slim Shady" + }, + "PHONE_NUMBER": { + "111-111-1111": "555-555-5555" + } + ... + } + """ + + # We are able to zip and loop through both lists because we expect + # them to return corresponding entities for each identified piece + # of analyzable data from our input. + + # We sort them by their 'start' attribute because it allows us to + # match corresponding entities by their position in the input text. + analyzer_results = sorted(analyzer_results, key=lambda d: d.start) + anonymizer_results.items = sorted(anonymizer_results.items, key=lambda d: d.start) + + new_anonymizer_mapping: MappingDataType = defaultdict(dict) + + for analyzed_entity, anonymized_entity in zip( + analyzer_results, anonymizer_results.items + ): + original_value = original_text[analyzed_entity.start : analyzed_entity.end] + + if reversed: + new_anonymizer_mapping[anonymized_entity.entity_type][ + anonymized_entity.text + ] = original_value + else: + new_anonymizer_mapping[anonymized_entity.entity_type][ + original_value + ] = anonymized_entity.text + + return new_anonymizer_mapping diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index 526f685624bea..70bd7884fcaa7 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -from collections import defaultdict from pathlib import Path from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union @@ -14,6 +13,7 @@ from langchain_experimental.data_anonymizer.deanonymizer_mapping import ( DeanonymizerMapping, MappingDataType, + create_anonymizer_mapping, ) from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import ( default_matching_strategy, @@ -43,8 +43,7 @@ ) from e if TYPE_CHECKING: - from presidio_analyzer import EntityRecognizer, RecognizerResult - from presidio_anonymizer.entities import EngineResult + from presidio_analyzer import EntityRecognizer # Configuring Anonymizer for multiple languages # Detailed description and examples can be found here: @@ -156,17 +155,30 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: "Change your language configuration file to add more languages." ) - results = self._analyzer.analyze( + analyzer_results = self._analyzer.analyze( text, entities=self.analyzed_fields, language=language, ) - return self._anonymizer.anonymize( + filtered_analyzer_results = ( + self._anonymizer._remove_conflicts_and_get_text_manipulation_data( + analyzer_results + ) + ) + + anonymizer_results = self._anonymizer.anonymize( text, - analyzer_results=results, + analyzer_results=analyzer_results, operators=self.operators, - ).text + ) + + anonymizer_mapping = create_anonymizer_mapping( + text, + filtered_analyzer_results, + anonymizer_results, + ) + return default_matching_strategy(text, anonymizer_mapping) class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase): @@ -185,58 +197,6 @@ def deanonymizer_mapping(self) -> MappingDataType: """Return the deanonymizer mapping""" return self._deanonymizer_mapping.data - def _update_deanonymizer_mapping( - self, - original_text: str, - analyzer_results: List[RecognizerResult], - anonymizer_results: EngineResult, - ) -> None: - """Creates or updates the mapping used to de-anonymize text. - - This method exploits the results returned by the - analysis and anonymization processes. - - It constructs a mapping from each anonymized entity - back to its original text value. - - Mapping will be stored as "deanonymizer_mapping" property. - - Example of "deanonymizer_mapping": - { - "PERSON": { - "": "", - "John Doe": "Slim Shady" - }, - "PHONE_NUMBER": { - "111-111-1111": "555-555-5555" - } - ... - } - """ - - # We are able to zip and loop through both lists because we expect - # them to return corresponding entities for each identified piece - # of analyzable data from our input. - - # We sort them by their 'start' attribute because it allows us to - # match corresponding entities by their position in the input text. - analyzer_results = sorted(analyzer_results, key=lambda d: d.start) - anonymizer_results.items = sorted( - anonymizer_results.items, key=lambda d: d.start - ) - - new_deanonymizer_mapping: MappingDataType = defaultdict(dict) - - for analyzed_entity, anonymized_entity in zip( - analyzer_results, anonymizer_results.items - ): - original_value = original_text[analyzed_entity.start : analyzed_entity.end] - new_deanonymizer_mapping[anonymized_entity.entity_type][ - anonymized_entity.text - ] = original_value - - self._deanonymizer_mapping.update(new_deanonymizer_mapping) - def _anonymize(self, text: str, language: Optional[str] = None) -> str: """Anonymize text. Each PII entity is replaced with a fake value. @@ -278,15 +238,18 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: operators=self.operators, ) - self._update_deanonymizer_mapping( - text, filtered_analyzer_results, anonymizer_results + new_deanonymizer_mapping = create_anonymizer_mapping( + text, + filtered_analyzer_results, + anonymizer_results, + reversed=True, ) + self._deanonymizer_mapping.update(new_deanonymizer_mapping) anonymizer_mapping = { key: {v: k for k, v in inner_dict.items()} for key, inner_dict in self.deanonymizer_mapping.items() } - return default_matching_strategy(text, anonymizer_mapping) def _deanonymize( From 866831ff634c24be4a20aeda83b127bb2123cb10 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds Date: Mon, 11 Sep 2023 14:06:07 +0000 Subject: [PATCH 03/16] Adjust tests --- libs/experimental/tests/unit_tests/test_data_anonymizer.py | 2 ++ .../tests/unit_tests/test_reversible_data_anonymizer.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py index 138b60eca89e0..a72b4eea95e38 100644 --- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -78,6 +78,8 @@ def test_add_recognizer_operator() -> None: assert anonymized_text == " Jane Doe was here." # anonymizing with custom recognizer and operator + anonymizer = PresidioAnonymizer(analyzed_fields=[]) + anonymizer.add_recognizer(custom_recognizer) custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} anonymizer.add_operators(custom_operator) anonymized_text = anonymizer.anonymize(text) diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py index 9484a0e9dca06..cbd52e53a80be 100644 --- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -79,6 +79,8 @@ def test_add_recognizer_operator() -> None: assert anonymized_text == "<TITLE> Jane Doe was here." # anonymizing with custom recognizer and operator + anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[]) + anonymizer.add_recognizer(custom_recognizer) custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} anonymizer.add_operators(custom_operator) anonymized_text = anonymizer.anonymize(text) From 429b8dc304e200858b4f38d0483283e0dd9647c0 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Mon, 11 Sep 2023 14:13:19 +0000 Subject: [PATCH 04/16] CR --- .../data_anonymizer/deanonymizer_mapping.py | 2 +- .../data_anonymizer/presidio.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index c104a14dcf23b..e4385352cf45f 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -21,7 +21,7 @@ def data(self) -> MappingDataType: def update(self, new_mapping: MappingDataType) -> None: """Update the deanonymizer mapping with new values - Duplicate values will not be added + Duplicated values will not be added """ new_values_seen = set() diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index 70bd7884fcaa7..14898492a6001 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -197,6 +197,15 @@ def deanonymizer_mapping(self) -> MappingDataType: """Return the deanonymizer mapping""" return self._deanonymizer_mapping.data + @property + def anonymizer_mapping(self) -> MappingDataType: + """Return the anonymizer mapping + This is just the reverse version of the deanonymizer mapping.""" + return { + key: {v: k for k, v in inner_dict.items()} + for key, inner_dict in self.deanonymizer_mapping.items() + } + def _anonymize(self, text: str, language: Optional[str] = None) -> str: """Anonymize text. Each PII entity is replaced with a fake value. @@ -246,11 +255,7 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: ) self._deanonymizer_mapping.update(new_deanonymizer_mapping) - anonymizer_mapping = { - key: {v: k for k, v in inner_dict.items()} - for key, inner_dict in self.deanonymizer_mapping.items() - } - return default_matching_strategy(text, anonymizer_mapping) + return default_matching_strategy(text, self.anonymizer_mapping) def _deanonymize( self, From 9503094053b73a918ad0fa6a8ee6076be9d63404 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Tue, 12 Sep 2023 08:49:42 +0000 Subject: [PATCH 05/16] test instance anonymization --- .../tests/unit_tests/test_data_anonymizer.py | 16 ++++++++++++ .../test_reversible_data_anonymizer.py | 26 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py index a72b4eea95e38..8bda04e026f94 100644 --- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -39,6 +39,22 @@ def test_anonymize_multiple() -> None: assert phrase not in anonymized_text +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_check_instances() -> None: + """Test anonymizing multiple items in a sentence""" + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + text = ( + "This is John Smith. John Smith works in a bakery." "John Smith is a good guy" + ) + anonymizer = PresidioAnonymizer(["PERSON"], faker_seed=42) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text.count("Noah Rhodes") == 3 + + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text.count("Noah Rhodes") == 0 + + @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") def test_anonymize_with_custom_operator() -> None: """Test anonymize a name with a custom operator""" diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py index cbd52e53a80be..4e045a91c6e59 100644 --- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -40,6 +40,32 @@ def test_anonymize_multiple() -> None: assert phrase not in anonymized_text +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_check_instances() -> None: + """Test anonymizing multiple items in a sentence""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = ( + "This is John Smith. John Smith works in a bakery." "John Smith is a good guy" + ) + anonymizer = PresidioReversibleAnonymizer(["PERSON"], faker_seed=42) + anonymized_text = anonymizer.anonymize(text) + persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys()) + assert len(persons) == 1 + + anonymized_name = persons[0] + assert anonymized_text.count(anonymized_name) == 3 + + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text.count(anonymized_name) == 3 + assert anonymizer.deanonymizer_mapping["PERSON"][anonymized_name] == "John Smith" + + text = "This is Jane Smith" + anonymized_text = anonymizer.anonymize(text) + persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys()) + assert len(persons) == 2 + + @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") def test_anonymize_with_custom_operator() -> None: """Test anonymize a name with a custom operator""" From 9ee9ae532c62646984736c860510dff0a2fcd7de Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Tue, 12 Sep 2023 09:51:36 +0000 Subject: [PATCH 06/16] CR --- .../data_anonymizer/deanonymizer_mapping.py | 8 ++++---- .../langchain_experimental/data_anonymizer/presidio.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index e4385352cf45f..f9f079baf39c3 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -40,17 +40,17 @@ def create_anonymizer_mapping( original_text: str, analyzer_results: List[RecognizerResult], anonymizer_results: EngineResult, - reversed: bool = False, + is_reversed: bool = False, ) -> MappingDataType: """Creates or updates the mapping used to anonymize and/or deanonymize text. This method exploits the results returned by the analysis and anonymization processes. - If reversed is True, it constructs a mapping from each original + If is_reversed is True, it constructs a mapping from each original entity to its anonymized value. - If reversed is False, it constructs a mapping from each + If is_reversed is False, it constructs a mapping from each anonymized entity back to its original text value. Example of mapping: @@ -82,7 +82,7 @@ def create_anonymizer_mapping( ): original_value = original_text[analyzed_entity.start : analyzed_entity.end] - if reversed: + if is_reversed: new_anonymizer_mapping[anonymized_entity.entity_type][ anonymized_entity.text ] = original_value diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index 14898492a6001..7e377e67839d5 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -251,7 +251,7 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: text, filtered_analyzer_results, anonymizer_results, - reversed=True, + is_reversed=True, ) self._deanonymizer_mapping.update(new_deanonymizer_mapping) From 2ce52566e81ff731e320760b646b659accde1092 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Tue, 12 Sep 2023 09:57:26 +0000 Subject: [PATCH 07/16] Add descriptions about instance anonymization --- .../data_anonymizer/presidio.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index 7e377e67839d5..e2835cf828996 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -139,6 +139,14 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: Each PII entity is replaced with a fake value. Each time fake values will be different, as they are generated randomly. + PresidioAnonymizer has no built-in memory - + so it will not remember the effects of anonymizing previous texts. + >>> anonymizer = PresidioAnonymizer() + >>> anonymizer.anonymize("John Doe") + 'Noah Rhodes' + >>> anonymizer.anonymize("John Doe") + 'Brett Russell' + Args: text: text to anonymize language: language to use for analysis of PII @@ -213,6 +221,14 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: At the same time, we will create a mapping from each anonymized entity back to its original text value. + Thanks to the built-in memory, all previously anonymised entities + will be remembered and replaced by the same fake values: + >>> anonymizer = PresidioReversibleAnonymizer() + >>> anonymizer.anonymize("John Doe") + 'Noah Rhodes' + >>> anonymizer.anonymize("John Doe") + 'Noah Rhodes' + Args: text: text to anonymize language: language to use for analysis of PII From 90fe863ede59f1a0609fd9cd73f6cdb1deb0e7d8 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Tue, 12 Sep 2023 11:10:04 +0000 Subject: [PATCH 08/16] Update docs --- .../presidio_data_anonymization/index.ipynb | 98 ++++++++++++++++--- .../reversible.ipynb | 75 ++++++++++---- 2 files changed, 143 insertions(+), 30 deletions(-) diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb index 2502a45092244..ba65d33994a15 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb @@ -53,7 +53,7 @@ { "data": { "text/plain": [ - "'My name is Laura Ruiz, call me at +1-412-982-8374x13414 or email me at javierwatkins@example.net'" + "'My name is James Martinez, call me at (576)928-1972x679 or email me at lisa44@example.com'" ] }, "execution_count": 2, @@ -114,11 +114,11 @@ "text": [ "Dear Sir/Madam,\n", "\n", - "We regret to inform you that Richard Fields has recently misplaced his wallet, which contains a sum of cash and his credit card bearing the number 30479847307774. \n", + "We regret to inform you that Mr. Dennis Cooper has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 3588895295514977. \n", "\n", - "Should you happen to come across it, we kindly request that you contact us immediately at 6439182672 or via email at frank45@example.com.\n", + "Should you happen to come across the aforementioned wallet, kindly contact us immediately at (428)451-3494x4110 or send an email to perryluke@example.com.\n", "\n", - "Thank you for your attention to this matter.\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", "\n", "Yours faithfully,\n", "\n", @@ -159,7 +159,7 @@ { "data": { "text/plain": [ - "'My name is Adrian Fleming, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + "'My name is Shannon Steele, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" ] }, "execution_count": 6, @@ -190,7 +190,7 @@ { "data": { "text/plain": [ - "'My name is Justin Miller, call me at 761-824-1889 or email me at real.slim.shady@gmail.com'" + "'My name is Wesley Flores, call me at (498)576-9526 or email me at real.slim.shady@gmail.com'" ] }, "execution_count": 7, @@ -225,7 +225,7 @@ { "data": { "text/plain": [ - "'My name is Dr. Jennifer Baker, call me at (508)839-9329x232 or email me at ehamilton@example.com'" + "'My name is Carla Fisher, call me at 001-683-324-0721x0644 or email me at krausejeremy@example.com'" ] }, "execution_count": 8, @@ -256,7 +256,7 @@ { "data": { "text/plain": [ - "'My polish phone number is NRGN41434238921378'" + "'My polish phone number is QESQ21234635370499'" ] }, "execution_count": 9, @@ -361,7 +361,7 @@ { "data": { "text/plain": [ - "'511 622 683'" + "'665 631 080'" ] }, "execution_count": 13, @@ -422,7 +422,7 @@ { "data": { "text/plain": [ - "'My polish phone number is +48 734 630 977'" + "'My polish phone number is 538 521 657'" ] }, "execution_count": 16, @@ -438,8 +438,80 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Future works\n", - "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object." + "## Important considerations\n", + "\n", + "### Anonymizer detection rates\n", + "\n", + "**The level of anonymization and the precision of detection are just as good as the quality of the recognizers implemented.**\n", + "\n", + "Texts from different sources and in different languages have varying characteristics, so it is necessary to test the detection precision and iteratively add recognizers and operators to achieve better and better results.\n", + "\n", + "Microsoft Presidio gives a lot of freedom to refine anonymization. The library's author has provided his [recommendations and a step-by-step guide for improving detection rates](https://github.com/microsoft/presidio/discussions/767#discussion-3567223)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Instance anonymization\n", + "\n", + "`PresidioAnonymizer` has no built-in memory. Therefore, two occurrences of the entity in the subsequent texts will be replaced with two different fake values:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Robert Morales. Hi Robert Morales!\n", + "My name is Kelly Mccoy. Hi Kelly Mccoy!\n" + ] + } + ], + "source": [ + "print(anonymizer.anonymize(\"My name is John Doe. Hi John Doe!\"))\n", + "print(anonymizer.anonymize(\"My name is John Doe. Hi John Doe!\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To preserve previous anonymization results, use `PresidioReverseAnonymizer`, which has built-in memory:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Ashley Cervantes. Hi Ashley Cervantes!\n", + "My name is Ashley Cervantes. Hi Ashley Cervantes!\n" + ] + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer_with_memory = PresidioReversibleAnonymizer()\n", + "\n", + "print(anonymizer_with_memory.anonymize(\"My name is John Doe. Hi John Doe!\"))\n", + "print(anonymizer_with_memory.anonymize(\"My name is John Doe. Hi John Doe!\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can learn more about `PresidioReverseAnonymizer` in the next section." ] } ], @@ -459,7 +531,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb index de5655ba1e9d5..a61f7894d3ec2 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb @@ -185,14 +185,13 @@ "text": [ "Dear Sir/Madam,\n", "\n", - "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n", + "We regret to inform you that Monique Turner has recently misplaced his wallet, which contains a sum of cash and his credit card with the number 213152056829866. \n", "\n", - "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n", + "If you happen to come across this wallet, kindly contact us at (770)908-7734x2835 or send an email to barbara25@example.net.\n", "\n", - "Your prompt assistance in this matter would be greatly appreciated.\n", - "\n", - "Yours faithfully,\n", + "Thank you for your cooperation.\n", "\n", + "Sincerely,\n", "[Your Name]\n" ] } @@ -232,14 +231,13 @@ "text": [ "Dear Sir/Madam,\n", "\n", - "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n", - "\n", - "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", + "We regret to inform you that Slim Shady has recently misplaced his wallet, which contains a sum of cash and his credit card with the number 4916 0387 9536 0861. \n", "\n", - "Your prompt assistance in this matter would be greatly appreciated.\n", + "If you happen to come across this wallet, kindly contact us at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", "\n", - "Yours faithfully,\n", + "Thank you for your cooperation.\n", "\n", + "Sincerely,\n", "[Your Name]\n" ] } @@ -356,13 +354,57 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can save the mapping itself to a file for future use: " + "Thanks to the built-in memory, entities that have already been detected and anonymised will take the same form in subsequent processed texts, so no duplicates will exist in the mapping:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My VISA card number is 3537672423884966 and my name is William Bowman.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + " '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"My VISA card number is 4001 9192 5753 7193 and my name is John Doe.\"\n", + " )\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can save the mapping itself to a file for future use: " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, "outputs": [], "source": [ "# We can save the deanonymizer mapping as a JSON or YAML file\n", @@ -380,7 +422,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -389,7 +431,7 @@ "{}" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -402,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -415,7 +457,7 @@ " '3537672423884966': '4001 9192 5753 7193'}}" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -432,7 +474,6 @@ "source": [ "## Future works\n", "\n", - "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n", "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs." ] } @@ -453,7 +494,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.11.4" } }, "nbformat": 4, From a95d7032c62336b050eb5c812cad604f65d7cca7 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Tue, 12 Sep 2023 11:37:16 +0000 Subject: [PATCH 09/16] CR 2 --- .../data_anonymizer/presidio.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index e2835cf828996..c98ab2b73e4ad 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -142,10 +142,10 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: PresidioAnonymizer has no built-in memory - so it will not remember the effects of anonymizing previous texts. >>> anonymizer = PresidioAnonymizer() - >>> anonymizer.anonymize("John Doe") - 'Noah Rhodes' - >>> anonymizer.anonymize("John Doe") - 'Brett Russell' + >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") + 'My name is Noah Rhodes. Hi Noah Rhodes!' + >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") + 'My name is Brett Russell. Hi Brett Russell!' Args: text: text to anonymize @@ -224,10 +224,10 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: Thanks to the built-in memory, all previously anonymised entities will be remembered and replaced by the same fake values: >>> anonymizer = PresidioReversibleAnonymizer() - >>> anonymizer.anonymize("John Doe") - 'Noah Rhodes' - >>> anonymizer.anonymize("John Doe") - 'Noah Rhodes' + >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") + 'My name is Noah Rhodes. Hi Noah Rhodes!' + >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") + 'My name is Noah Rhodes. Hi Noah Rhodes!' Args: text: text to anonymize From 8f2ac21ab93c2cedf3ce4ed4df43d386ebc43bfe Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Tue, 12 Sep 2023 11:44:05 +0000 Subject: [PATCH 10/16] CR 3 --- .../guides/privacy/presidio_data_anonymization/index.ipynb | 4 ++-- libs/experimental/tests/unit_tests/test_data_anonymizer.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb index ba65d33994a15..617809d489d85 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb @@ -481,7 +481,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To preserve previous anonymization results, use `PresidioReverseAnonymizer`, which has built-in memory:" + "To preserve previous anonymization results, use `PresidioReversibleAnonymizer`, which has built-in memory:" ] }, { @@ -511,7 +511,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can learn more about `PresidioReverseAnonymizer` in the next section." + "You can learn more about `PresidioReversibleAnonymizer` in the next section." ] } ], diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py index 8bda04e026f94..db28e17c1581f 100644 --- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -94,8 +94,6 @@ def test_add_recognizer_operator() -> None: assert anonymized_text == "<TITLE> Jane Doe was here." # anonymizing with custom recognizer and operator - anonymizer = PresidioAnonymizer(analyzed_fields=[]) - anonymizer.add_recognizer(custom_recognizer) custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} anonymizer.add_operators(custom_operator) anonymized_text = anonymizer.anonymize(text) From ddb5cdacb5a76abae454e5106a121dd097700e71 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Thu, 21 Sep 2023 15:36:08 +0000 Subject: [PATCH 11/16] Add possibility to omit using faker values --- .../data_anonymizer/deanonymizer_mapping.py | 52 +++++++++++-------- .../data_anonymizer/presidio.py | 37 +++++++++---- 2 files changed, 58 insertions(+), 31 deletions(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index f9f079baf39c3..1b9907593559e 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -23,17 +23,24 @@ def update(self, new_mapping: MappingDataType) -> None: """Update the deanonymizer mapping with new values Duplicated values will not be added """ - new_values_seen = set() + seen_values = set() for entity_type, values in new_mapping.items(): - for k, v in values.items(): - # Make sure it is not a duplicate value + count = len(self.mapping[entity_type]) + 1 + + for key, value in values.items(): if ( - v not in self.mapping[entity_type].values() - and v not in new_values_seen + value not in seen_values + and value not in self.mapping[entity_type].values() ): - self.mapping[entity_type][k] = v - new_values_seen.update({v}) + new_key = ( + f"<{entity_type}_{count}>" + if key.startswith("<") and key.endswith(">") + else key + ) + self.mapping[entity_type][new_key] = value + seen_values.add(value) + count += 1 def create_anonymizer_mapping( @@ -75,20 +82,23 @@ def create_anonymizer_mapping( analyzer_results = sorted(analyzer_results, key=lambda d: d.start) anonymizer_results.items = sorted(anonymizer_results.items, key=lambda d: d.start) - new_anonymizer_mapping: MappingDataType = defaultdict(dict) + mapping: MappingDataType = defaultdict(dict) + count: dict = defaultdict(int) + + for analyzed, anonymized in zip(analyzer_results, anonymizer_results.items): + original_value = original_text[analyzed.start : analyzed.end] + anonymized_value = ( + anonymized.text + if not anonymized.text.startswith("<") + else f"<{anonymized.entity_type}_{count[anonymized.entity_type] + 1}>" + ) - for analyzed_entity, anonymized_entity in zip( - analyzer_results, anonymizer_results.items - ): - original_value = original_text[analyzed_entity.start : analyzed_entity.end] + entity_type = anonymized.entity_type + mapping_key = anonymized_value if is_reversed else original_value + mapping_value = original_value if is_reversed else anonymized_value - if is_reversed: - new_anonymizer_mapping[anonymized_entity.entity_type][ - anonymized_entity.text - ] = original_value - else: - new_anonymizer_mapping[anonymized_entity.entity_type][ - original_value - ] = anonymized_entity.text + if mapping_key not in mapping[entity_type]: + mapping[entity_type][mapping_key] = mapping_value + count[entity_type] += 1 - return new_anonymizer_mapping + return mapping diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index c98ab2b73e4ad..5bd8dd41862f6 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -68,6 +68,7 @@ def __init__( analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, + use_faker_operators: bool = True, faker_seed: Optional[int] = None, ): """ @@ -92,17 +93,11 @@ def __init__( if analyzed_fields is not None else list(get_pseudoanonymizer_mapping().keys()) ) + self.operators = ( operators - if operators is not None - else { - field: OperatorConfig( - operator_name="custom", params={"lambda": faker_function} - ) - for field, faker_function in get_pseudoanonymizer_mapping( - faker_seed - ).items() - } + if operators + else self._get_operator_config(use_faker_operators, faker_seed) ) provider = NlpEngineProvider(nlp_configuration=languages_config) @@ -132,6 +127,21 @@ def add_operators(self, operators: Dict[str, OperatorConfig]) -> None: """ self.operators.update(operators) + def _get_operator_config( + self, use_faker_operators: bool, faker_seed: Optional[int] + ) -> dict: + if use_faker_operators: + return { + field: OperatorConfig( + operator_name="custom", params={"lambda": faker_function} + ) + for field, faker_function in get_pseudoanonymizer_mapping( + faker_seed + ).items() + } + + return {} + class PresidioAnonymizer(PresidioAnonymizerBase): def _anonymize(self, text: str, language: Optional[str] = None) -> str: @@ -195,9 +205,16 @@ def __init__( analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, + use_faker_operators: bool = True, faker_seed: Optional[int] = None, ): - super().__init__(analyzed_fields, operators, languages_config, faker_seed) + super().__init__( + analyzed_fields, + operators, + languages_config, + use_faker_operators, + faker_seed, + ) self._deanonymizer_mapping = DeanonymizerMapping() @property From bd09a828e7d174a0913c7d018a96b6cdc4924587 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Thu, 21 Sep 2023 15:36:19 +0000 Subject: [PATCH 12/16] Adjust tests --- .../tests/unit_tests/test_data_anonymizer.py | 11 ++++++----- .../unit_tests/test_reversible_data_anonymizer.py | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py index db28e17c1581f..c515ac0c92ee1 100644 --- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -49,10 +49,11 @@ def test_check_instances() -> None: ) anonymizer = PresidioAnonymizer(["PERSON"], faker_seed=42) anonymized_text = anonymizer.anonymize(text) - assert anonymized_text.count("Noah Rhodes") == 3 + assert anonymized_text.count("Connie Lawrence") == 3 + # New name should be generated anonymized_text = anonymizer.anonymize(text) - assert anonymized_text.count("Noah Rhodes") == 0 + assert anonymized_text.count("Connie Lawrence") == 0 @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") @@ -62,13 +63,13 @@ def test_anonymize_with_custom_operator() -> None: from langchain_experimental.data_anonymizer import PresidioAnonymizer - custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "<name>"})} + custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "NAME"})} anonymizer = PresidioAnonymizer(operators=custom_operator) text = "Jane Doe was here." anonymized_text = anonymizer.anonymize(text) - assert anonymized_text == "<name> was here." + assert anonymized_text == "NAME was here." @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") @@ -91,7 +92,7 @@ def test_add_recognizer_operator() -> None: # anonymizing with custom recognizer text = "Madam Jane Doe was here." anonymized_text = anonymizer.anonymize(text) - assert anonymized_text == "<TITLE> Jane Doe was here." + assert anonymized_text == "<TITLE_1> Jane Doe was here." # anonymizing with custom recognizer and operator custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py index 4e045a91c6e59..92579bb9e154f 100644 --- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -73,13 +73,13 @@ def test_anonymize_with_custom_operator() -> None: from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer - custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "<name>"})} + custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "NAME"})} anonymizer = PresidioReversibleAnonymizer(operators=custom_operator) text = "Jane Doe was here." anonymized_text = anonymizer.anonymize(text) - assert anonymized_text == "<name> was here." + assert anonymized_text == "NAME was here." @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") @@ -102,7 +102,7 @@ def test_add_recognizer_operator() -> None: # anonymizing with custom recognizer text = "Madam Jane Doe was here." anonymized_text = anonymizer.anonymize(text) - assert anonymized_text == "<TITLE> Jane Doe was here." + assert anonymized_text == "<TITLE_1> Jane Doe was here." # anonymizing with custom recognizer and operator anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[]) From 0db3ab83bdccae1360e1c4b187779d4e52df1b7c Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Thu, 21 Sep 2023 15:45:47 +0000 Subject: [PATCH 13/16] Add two more tests for non faker values --- .../tests/unit_tests/test_data_anonymizer.py | 18 ++++++++++++++++++ .../test_reversible_data_anonymizer.py | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py index c515ac0c92ee1..ccaaed544181e 100644 --- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -99,3 +99,21 @@ def test_add_recognizer_operator() -> None: anonymizer.add_operators(custom_operator) anonymized_text = anonymizer.anonymize(text) assert anonymized_text == "Dear Jane Doe was here." + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_non_faker_values() -> None: + """Test anonymizing multiple items in a sentence without faker values""" + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + text = ( + "My name is John Smith. Your name is Adam Smith. Her name is Jane Smith." + "Our names are: John Smith, Adam Smith, Jane Smith." + ) + expected_result = ( + "My name is <PERSON_1>. Your name is <PERSON_2>. Her name is <PERSON_3>." + "Our names are: <PERSON_1>, <PERSON_2>, <PERSON_3>." + ) + anonymizer = PresidioAnonymizer(use_faker_operators=False) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == expected_result diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py index 92579bb9e154f..89e908b550a2b 100644 --- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -180,3 +180,21 @@ def test_save_load_deanonymizer_mapping() -> None: finally: os.remove("test_file.json") + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_non_faker_values() -> None: + """Test anonymizing multiple items in a sentence without faker values""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = ( + "My name is John Smith. Your name is Adam Smith. Her name is Jane Smith." + "Our names are: John Smith, Adam Smith, Jane Smith." + ) + expected_result = ( + "My name is <PERSON_1>. Your name is <PERSON_2>. Her name is <PERSON_3>." + "Our names are: <PERSON_1>, <PERSON_2>, <PERSON_3>." + ) + anonymizer = PresidioReversibleAnonymizer(use_faker_operators=False) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == expected_result From 97767bdb549cd06bd9da58219ef3e83c1070fd83 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Thu, 5 Oct 2023 18:08:45 +0000 Subject: [PATCH 14/16] Better faker operators handling --- .../data_anonymizer/presidio.py | 40 ++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index 5bd8dd41862f6..6f102fdd62b55 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -68,7 +68,7 @@ def __init__( analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, - use_faker_operators: bool = True, + add_default_faker_operators: bool = True, faker_seed: Optional[int] = None, ): """ @@ -94,11 +94,20 @@ def __init__( else list(get_pseudoanonymizer_mapping().keys()) ) - self.operators = ( - operators - if operators - else self._get_operator_config(use_faker_operators, faker_seed) - ) + if add_default_faker_operators: + self.operators = { + field: OperatorConfig( + operator_name="custom", params={"lambda": faker_function} + ) + for field, faker_function in get_pseudoanonymizer_mapping( + faker_seed + ).items() + } + else: + self.operators = {} + + if operators: + self.add_operators(operators) provider = NlpEngineProvider(nlp_configuration=languages_config) nlp_engine = provider.create_engine() @@ -127,21 +136,6 @@ def add_operators(self, operators: Dict[str, OperatorConfig]) -> None: """ self.operators.update(operators) - def _get_operator_config( - self, use_faker_operators: bool, faker_seed: Optional[int] - ) -> dict: - if use_faker_operators: - return { - field: OperatorConfig( - operator_name="custom", params={"lambda": faker_function} - ) - for field, faker_function in get_pseudoanonymizer_mapping( - faker_seed - ).items() - } - - return {} - class PresidioAnonymizer(PresidioAnonymizerBase): def _anonymize(self, text: str, language: Optional[str] = None) -> str: @@ -205,14 +199,14 @@ def __init__( analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, - use_faker_operators: bool = True, + add_default_faker_operators: bool = True, faker_seed: Optional[int] = None, ): super().__init__( analyzed_fields, operators, languages_config, - use_faker_operators, + add_default_faker_operators, faker_seed, ) self._deanonymizer_mapping = DeanonymizerMapping() From 5e70b20531e37c4f2af121c551e69f590115b046 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai> Date: Thu, 5 Oct 2023 18:18:19 +0000 Subject: [PATCH 15/16] Better duplicate handling --- .../data_anonymizer/deanonymizer_mapping.py | 65 ++++++++++++++----- .../tests/unit_tests/test_data_anonymizer.py | 8 +-- .../test_reversible_data_anonymizer.py | 8 +-- 3 files changed, 58 insertions(+), 23 deletions(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index 1b9907593559e..7b05c9d0e30ed 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -1,5 +1,6 @@ from collections import defaultdict from dataclasses import dataclass, field +import re from typing import Dict, List from presidio_analyzer import RecognizerResult @@ -8,6 +9,18 @@ MappingDataType = Dict[str, Dict[str, str]] +def format_duplicated_operator(operator_name: str, count: int) -> str: + """Format the operator name with the count""" + + clean_operator_name = re.sub(r"[<>]", "", operator_name) + clean_operator_name = re.sub(r"_\d+$", "", clean_operator_name) + + if operator_name.startswith("<") and operator_name.endswith(">"): + return f"<{clean_operator_name}_{count}>" + else: + return f"{clean_operator_name}_{count}" + + @dataclass class DeanonymizerMapping: mapping: MappingDataType = field( @@ -22,6 +35,9 @@ def data(self) -> MappingDataType: def update(self, new_mapping: MappingDataType) -> None: """Update the deanonymizer mapping with new values Duplicated values will not be added + If there are multiple entities of the same type, the mapping will + include a count to differentiate them. For example, if there are + two names in the input text, the mapping will include NAME_1 and NAME_2. """ seen_values = set() @@ -34,10 +50,11 @@ def update(self, new_mapping: MappingDataType) -> None: and value not in self.mapping[entity_type].values() ): new_key = ( - f"<{entity_type}_{count}>" - if key.startswith("<") and key.endswith(">") + format_duplicated_operator(key, count) + if key in self.mapping[entity_type] else key ) + self.mapping[entity_type][new_key] = value seen_values.add(value) count += 1 @@ -60,6 +77,10 @@ def create_anonymizer_mapping( If is_reversed is False, it constructs a mapping from each anonymized entity back to its original text value. + If there are multiple entities of the same type, the mapping will + include a count to differentiate them. For example, if there are + two names in the input text, the mapping will include NAME_1 and NAME_2. + Example of mapping: { "PERSON": { @@ -72,33 +93,47 @@ def create_anonymizer_mapping( ... } """ - # We are able to zip and loop through both lists because we expect # them to return corresponding entities for each identified piece # of analyzable data from our input. # We sort them by their 'start' attribute because it allows us to # match corresponding entities by their position in the input text. - analyzer_results = sorted(analyzer_results, key=lambda d: d.start) - anonymizer_results.items = sorted(anonymizer_results.items, key=lambda d: d.start) + analyzer_results.sort(key=lambda d: d.start) + anonymizer_results.items.sort(key=lambda d: d.start) mapping: MappingDataType = defaultdict(dict) count: dict = defaultdict(int) for analyzed, anonymized in zip(analyzer_results, anonymizer_results.items): original_value = original_text[analyzed.start : analyzed.end] - anonymized_value = ( - anonymized.text - if not anonymized.text.startswith("<") - else f"<{anonymized.entity_type}_{count[anonymized.entity_type] + 1}>" - ) - entity_type = anonymized.entity_type - mapping_key = anonymized_value if is_reversed else original_value - mapping_value = original_value if is_reversed else anonymized_value - if mapping_key not in mapping[entity_type]: - mapping[entity_type][mapping_key] = mapping_value + if is_reversed: + cond = original_value in mapping[entity_type].values() + else: + cond = original_value in mapping[entity_type] + + if cond: + continue + + if ( + anonymized.text in mapping[entity_type].values() + or anonymized.text in mapping[entity_type] + ): + anonymized_value = format_duplicated_operator( + anonymized.text, count[entity_type] + 2 + ) count[entity_type] += 1 + else: + anonymized_value = anonymized.text + + mapping_key, mapping_value = ( + (anonymized_value, original_value) + if is_reversed + else (original_value, anonymized_value) + ) + + mapping[entity_type][mapping_key] = mapping_value return mapping diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py index ccaaed544181e..bf12a87395847 100644 --- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -92,7 +92,7 @@ def test_add_recognizer_operator() -> None: # anonymizing with custom recognizer text = "Madam Jane Doe was here." anonymized_text = anonymizer.anonymize(text) - assert anonymized_text == "<TITLE_1> Jane Doe was here." + assert anonymized_text == "<TITLE> Jane Doe was here." # anonymizing with custom recognizer and operator custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} @@ -111,9 +111,9 @@ def test_non_faker_values() -> None: "Our names are: John Smith, Adam Smith, Jane Smith." ) expected_result = ( - "My name is <PERSON_1>. Your name is <PERSON_2>. Her name is <PERSON_3>." - "Our names are: <PERSON_1>, <PERSON_2>, <PERSON_3>." + "My name is <PERSON>. Your name is <PERSON_2>. Her name is <PERSON_3>." + "Our names are: <PERSON>, <PERSON_2>, <PERSON_3>." ) - anonymizer = PresidioAnonymizer(use_faker_operators=False) + anonymizer = PresidioAnonymizer(add_default_faker_operators=False) anonymized_text = anonymizer.anonymize(text) assert anonymized_text == expected_result diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py index 89e908b550a2b..8ef2dcf68ca3f 100644 --- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -102,7 +102,7 @@ def test_add_recognizer_operator() -> None: # anonymizing with custom recognizer text = "Madam Jane Doe was here." anonymized_text = anonymizer.anonymize(text) - assert anonymized_text == "<TITLE_1> Jane Doe was here." + assert anonymized_text == "<TITLE> Jane Doe was here." # anonymizing with custom recognizer and operator anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[]) @@ -192,9 +192,9 @@ def test_non_faker_values() -> None: "Our names are: John Smith, Adam Smith, Jane Smith." ) expected_result = ( - "My name is <PERSON_1>. Your name is <PERSON_2>. Her name is <PERSON_3>." - "Our names are: <PERSON_1>, <PERSON_2>, <PERSON_3>." + "My name is <PERSON>. Your name is <PERSON_2>. Her name is <PERSON_3>." + "Our names are: <PERSON>, <PERSON_2>, <PERSON_3>." ) - anonymizer = PresidioReversibleAnonymizer(use_faker_operators=False) + anonymizer = PresidioReversibleAnonymizer(add_default_faker_operators=False) anonymized_text = anonymizer.anonymize(text) assert anonymized_text == expected_result From a8bfb3af036b1d473cc96ecaf48f84f97cd8ab35 Mon Sep 17 00:00:00 2001 From: Bagatur <baskaryan@gmail.com> Date: Thu, 5 Oct 2023 11:21:20 -0700 Subject: [PATCH 16/16] fmt --- .../data_anonymizer/deanonymizer_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index 7b05c9d0e30ed..9db586c2848c3 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -1,6 +1,6 @@ +import re from collections import defaultdict from dataclasses import dataclass, field -import re from typing import Dict, List from presidio_analyzer import RecognizerResult