From 555a6bfe7d109cbf8a43cc1c27f13c50ec8604bb Mon Sep 17 00:00:00 2001 From: Avani Siddhapura Date: Thu, 29 Feb 2024 10:34:54 +0530 Subject: [PATCH] test(bigquery-table-desc) added unit test for desc string cleaning --- .../ingestion/source/bigquery_v2/bigquery.py | 23 +++--------- .../source/bigquery_v2/bigquery_helper.py | 19 ++++++++++ .../unit/test_bigqueryv2_usage_source.py | 36 +++++++++++++++++++ 3 files changed, 59 insertions(+), 19 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 930ac90bfef03c..bcc0aa50ed22e6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -40,6 +40,9 @@ BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_helper import ( + unquote_and_decode_unicode_escape_seq, +) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, @@ -1073,7 +1076,7 @@ def gen_dataset_workunits( dataset_properties = DatasetProperties( name=datahub_dataset_name.get_table_display_name(), - description=self.unquote_and_decode_unicode_escape_seq(table.comment) + description=unquote_and_decode_unicode_escape_seq(table.comment) if table.comment else "", qualifiedName=str(datahub_dataset_name), @@ -1383,21 +1386,3 @@ def add_config_to_report(self): self.config.start_time, self.config.end_time, ) - - def unquote_and_decode_unicode_escape_seq( - self, - string: str, - leading_quote: str = '"', - trailing_quote: Optional[str] = None, - ) -> str: - """ - If string starts and ends with a quote, unquote it and decode Unicode escape 
# Helper module: metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py
import re
from typing import Optional

# Matches only well-formed backslash escape sequences. Anything else that
# merely contains a backslash (e.g. "C:\users", a trailing "\") is left
# untouched instead of making the decoder raise.
_ESCAPE_SEQUENCE = re.compile(
    r"""
    \\(?:
        U[0-9a-fA-F]{8}          # \UXXXXXXXX
      | u[0-9a-fA-F]{4}          # \uXXXX
      | x[0-9a-fA-F]{2}          # \xXX
      | [0-7]{1,3}               # octal, e.g. \101
      | N\{[A-Za-z0-9 -]+\}      # \N{CHARACTER NAME}
      | [\\'"abfnrtv\n]          # single-character escapes and line continuation
    )
    """,
    re.VERBOSE,
)


def _decode_escape_sequence(match: "re.Match[str]") -> str:
    """Return the character(s) denoted by one matched escape sequence."""
    sequence = match.group(0)
    try:
        # The matched text is pure ASCII by construction, so the codec only
        # ever sees the escape itself, never surrounding non-ASCII text.
        return sequence.encode("ascii").decode("unicode-escape")
    except UnicodeDecodeError:
        # Syntactically valid but undecodable (e.g. a code point above
        # U+10FFFF or an unknown \N{...} name): keep the literal text.
        return sequence


def unquote_and_decode_unicode_escape_seq(
    string: str,
    leading_quote: str = '"',
    trailing_quote: Optional[str] = None,
) -> str:
    """
    If string starts and ends with a quote, unquote it and decode Unicode escape
    sequences.

    Escape sequences are decoded one at a time, so characters that are already
    non-ASCII pass through unchanged. Decoding the whole string with the
    ``unicode-escape`` codec would read its UTF-8 bytes as Latin-1 and corrupt
    them. Malformed escape sequences are kept verbatim rather than raising
    ``UnicodeDecodeError``.

    Args:
        string: The text to clean.
        leading_quote: Opening quote to strip; may be longer than one character.
        trailing_quote: Closing quote to strip; defaults to ``leading_quote``
            when ``None`` or empty.

    Returns:
        The unquoted string with escape sequences decoded.
    """
    trailing_quote = trailing_quote if trailing_quote else leading_quote

    # The length guard stops a lone quote character from being counted as both
    # the opening and the closing quote.
    is_quoted = (
        len(string) >= len(leading_quote) + len(trailing_quote)
        and string.startswith(leading_quote)
        and string.endswith(trailing_quote)
    )
    if is_quoted:
        # Slice by the actual quote lengths instead of assuming one character.
        string = string[len(leading_quote) : len(string) - len(trailing_quote)]

    return _ESCAPE_SEQUENCE.sub(_decode_escape_sequence, string)
def test_unquote_and_decode_unicode_escape_seq():
    """Check quote stripping and Unicode escape decoding on representative inputs."""
    # Each entry is (input_string, expected_output). "\\u003c" and "\\u003e"
    # are the escaped forms of "<" and ">", so they must decode to "<World>".
    cases = [
        # Starts and ends with quotes and has Unicode escape sequences
        ('"Hello \\u003cWorld\\u003e"', "Hello <World>"),
        # Does not start and end with quotes
        ("Hello \\u003cWorld\\u003e", "Hello <World>"),
        # Empty string
        ("", ""),
        # No Unicode escape sequences
        ("No escape sequences here", "No escape sequences here"),
        # Starts and ends with quotes but has no escape sequences
        ('"No escape sequences here"', "No escape sequences here"),
    ]

    for input_string, expected_output in cases:
        result = unquote_and_decode_unicode_escape_seq(input_string)
        assert result == expected_output