Skip to content

Commit

Permalink
test(bigquery-table-desc) added unit test for desc string cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
AvaniSiddhapuraAPT committed Feb 29, 2024
1 parent 09e101f commit 9e6191f
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
BigQueryTableRef,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_helper import (
unquote_and_decode_unicode_escape_seq,
)
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.bigquery_schema import (
BigqueryColumn,
Expand Down Expand Up @@ -1073,7 +1076,7 @@ def gen_dataset_workunits(

dataset_properties = DatasetProperties(
name=datahub_dataset_name.get_table_display_name(),
description=self.unquote_and_decode_unicode_escape_seq(table.comment)
description=unquote_and_decode_unicode_escape_seq(table.comment)
if table.comment
else "",
qualifiedName=str(datahub_dataset_name),
Expand Down Expand Up @@ -1383,21 +1386,3 @@ def add_config_to_report(self):
self.config.start_time,
self.config.end_time,
)

def unquote_and_decode_unicode_escape_seq(
    self,
    string: str,
    leading_quote: str = '"',
    trailing_quote: Optional[str] = None,
) -> str:
    """
    Strip one pair of surrounding quote markers (if present) and decode
    backslash escape sequences (e.g. ``\\u003c`` becomes ``<``).

    Escape decoding is applied whether or not the string was quoted.

    Args:
        string: Text to clean.
        leading_quote: Opening quote marker; may be longer than one character.
        trailing_quote: Closing quote marker; falls back to ``leading_quote``
            when None or empty.

    Returns:
        The cleaned text. If the text contains a malformed escape sequence
        (for example a trailing backslash), the unquoted text is returned
        without escape decoding instead of raising.
    """
    trailing_quote = trailing_quote or leading_quote

    if string.startswith(leading_quote) and string.endswith(trailing_quote):
        # Slice by the markers' actual lengths so multi-character (or empty)
        # markers remove exactly what was matched.
        string = string[len(leading_quote) : len(string) - len(trailing_quote)]

    try:
        # The "unicode-escape" codec decodes bytes as Latin-1, so encoding as
        # UTF-8 first would turn every non-ASCII character into mojibake.
        # Latin-1 keeps characters up to U+00FF as single bytes, and
        # backslashreplace rewrites the rest as escapes that decode back to
        # the same characters.
        # NOTE: a backslash directly before a character above U+00FF is still
        # ambiguous and is not round-tripped exactly.
        return string.encode("latin-1", "backslashreplace").decode("unicode-escape")
    except UnicodeDecodeError:
        # Malformed escape (trailing backslash, truncated \uXXXX, ...):
        # keep the text rather than fail on it.
        return string
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Optional


def unquote_and_decode_unicode_escape_seq(
    string: str,
    leading_quote: str = '"',
    trailing_quote: Optional[str] = None,
) -> str:
    """
    Strip one pair of surrounding quote markers (if present) and decode
    backslash escape sequences (e.g. ``\\u003c`` becomes ``<``).

    Escape decoding is applied whether or not the string was quoted.

    Args:
        string: Text to clean.
        leading_quote: Opening quote marker; may be longer than one character.
        trailing_quote: Closing quote marker; falls back to ``leading_quote``
            when None or empty.

    Returns:
        The cleaned text. If the text contains a malformed escape sequence
        (for example a trailing backslash), the unquoted text is returned
        without escape decoding instead of raising.
    """
    trailing_quote = trailing_quote or leading_quote

    if string.startswith(leading_quote) and string.endswith(trailing_quote):
        # Slice by the markers' actual lengths so multi-character (or empty)
        # markers remove exactly what was matched.
        string = string[len(leading_quote) : len(string) - len(trailing_quote)]

    try:
        # The "unicode-escape" codec decodes bytes as Latin-1, so encoding as
        # UTF-8 first would turn every non-ASCII character into mojibake.
        # Latin-1 keeps characters up to U+00FF as single bytes, and
        # backslashreplace rewrites the rest as escapes that decode back to
        # the same characters.
        # NOTE: a backslash directly before a character above U+00FF is still
        # ambiguous and is not round-tripped exactly.
        return string.encode("latin-1", "backslashreplace").decode("unicode-escape")
    except UnicodeDecodeError:
        # Malformed escape (trailing backslash, truncated \uXXXX, ...):
        # keep the text rather than fail on it.
        return string
36 changes: 36 additions & 0 deletions metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
BigQueryTableRef,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_helper import (
unquote_and_decode_unicode_escape_seq,
)
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
from datahub.sql_parsing.schema_resolver import SchemaResolver
Expand Down Expand Up @@ -176,3 +179,36 @@ def test_bigquery_table_sanitasitation():
assert table_identifier.dataset == "dataset-4567"
assert table_identifier.table == "foo_2016*"
assert table_identifier.get_table_display_name() == "foo"


def test_unquote_and_decode_unicode_escape_seq():
    """Exercise quote stripping and escape decoding on representative inputs."""
    # Each entry pairs a raw input with the cleaned value the helper must return.
    cases = [
        # Quoted, with Unicode escape sequences
        ('"Hello \\u003cWorld\\u003e"', "Hello <World>"),
        # Unquoted, with Unicode escape sequences
        ("Hello \\u003cWorld\\u003e", "Hello <World>"),
        # Empty string
        ("", ""),
        # Unquoted, nothing to decode
        ("No escape sequences here", "No escape sequences here"),
        # Quoted, nothing to decode
        ('"No escape sequences here"', "No escape sequences here"),
    ]

    for raw_value, cleaned_value in cases:
        assert unquote_and_decode_unicode_escape_seq(raw_value) == cleaned_value

0 comments on commit 9e6191f

Please sign in to comment.