From af203228cef6488e87c7b05885307d3e09590d82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Tue, 29 Oct 2024 10:35:27 -0600 Subject: [PATCH 01/14] HJ-116 - fix BigQuery partitioning support bug - WHERE queries are not properly generated --- src/fides/api/service/connectors/query_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fides/api/service/connectors/query_config.py b/src/fides/api/service/connectors/query_config.py index 9a4477703f..19dd046f42 100644 --- a/src/fides/api/service/connectors/query_config.py +++ b/src/fides/api/service/connectors/query_config.py @@ -896,7 +896,7 @@ def get_formatted_query_string( Returns a query string with backtick formatting for tables that have the same names as BigQuery reserved words. """ - return f'SELECT {field_list} FROM `{self._generate_table_name()}` WHERE {" OR ".join(clauses)}' + return f'SELECT {field_list} FROM `{self._generate_table_name()}` WHERE ({" OR ".join(clauses)})' def generate_masking_stmt( self, From 26a780b0a0457001e5a0abfd22f0d95c3957b0ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Tue, 29 Oct 2024 11:17:27 -0600 Subject: [PATCH 02/14] HJ-116 - Update tests --- tests/ops/service/connectors/test_bigquery_connector.py | 1 - tests/ops/service/connectors/test_queryconfig.py | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index a083e0ae44..7a9ef245b8 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -14,7 +14,6 @@ from fides.api.service.connectors.sql_connector import BigQueryConnector -@pytest.mark.skip(reason="move to plus in progress") @pytest.mark.integration_external @pytest.mark.integration_bigquery class TestBigQueryConnector: diff --git a/tests/ops/service/connectors/test_queryconfig.py b/tests/ops/service/connectors/test_queryconfig.py index 3085ed98bf..54f27cfd35 100644 --- a/tests/ops/service/connectors/test_queryconfig.py +++ b/tests/ops/service/connectors/test_queryconfig.py @@ -744,7 +744,6 @@ def test_put_query_param_formatting_single_key( } -@pytest.mark.skip(reason="move to plus in progress") class TestBigQueryQueryConfig: @pytest.fixture(scope="function") def bigquery_client(self, bigquery_connection_config): @@ -994,7 +993,6 @@ def test_query_to_str(self, complete_execution_node): assert query_to_str == "SELECT name FROM users WHERE email = 'test@example.com'" -@pytest.mark.skip(reason="move to plus in progress") @pytest.mark.integration_external @pytest.mark.integration_bigquery class TestBigQueryQueryConfig: @@ -1026,16 +1024,16 @@ def execution_node( BigQueryNamespaceMeta( project_id="cool_project", dataset_id="first_dataset" ), - "SELECT address_id, created, email, id, name FROM `cool_project.first_dataset.customer` WHERE email = :email", + "SELECT address_id, created, email, id, name FROM `cool_project.first_dataset.customer` WHERE (email = :email)", ), # Namespace meta will be a dict / JSON when retrieved from the DB ( {"project_id": "cool_project", "dataset_id": "first_dataset"}, - "SELECT address_id, created, email, id, name FROM `cool_project.first_dataset.customer` WHERE email = :email", + "SELECT address_id, created, email, id, name FROM `cool_project.first_dataset.customer` WHERE (email = :email)", ), ( None, - "SELECT address_id, created, email, id, name FROM `customer` WHERE email = :email", + "SELECT address_id, created, email, id, name FROM `customer` WHERE (email = :email)", ), ], ) From 414f43e0e25e219d74b5d35c52b277dfb231bbdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Tue, 29 Oct 2024 11:26:44 -0600 Subject: [PATCH 03/14] QA --- tests/ops/service/connectors/test_queryconfig.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ops/service/connectors/test_queryconfig.py b/tests/ops/service/connectors/test_queryconfig.py index 54f27cfd35..5cc03ca680 100644 --- a/tests/ops/service/connectors/test_queryconfig.py +++ b/tests/ops/service/connectors/test_queryconfig.py @@ -744,6 +744,7 @@ def test_put_query_param_formatting_single_key( } +@pytest.mark.skip(reason="move to plus in progress") class TestBigQueryQueryConfig: @pytest.fixture(scope="function") def bigquery_client(self, bigquery_connection_config): From 1b6b97719c62f1eda5796faeb97a8216168a3032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Tue, 29 Oct 2024 11:47:58 -0600 Subject: [PATCH 04/14] QA --- src/fides/api/service/connectors/query_config.py | 8 ++++---- tests/ops/service/connectors/test_queryconfig.py | 6 ------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/fides/api/service/connectors/query_config.py b/src/fides/api/service/connectors/query_config.py index 19dd046f42..4b0758e69f 100644 --- a/src/fides/api/service/connectors/query_config.py +++ b/src/fides/api/service/connectors/query_config.py @@ -673,7 +673,7 @@ def get_formatted_query_string( ) -> str: """Returns a query string with double quotation mark formatting for tables that have the same names as Postgres reserved words.""" - return f'SELECT {field_list} FROM "{self.node.collection.name}" WHERE {" OR ".join(clauses)}' + return f'SELECT {field_list} FROM "{self.node.collection.name}" WHERE ({" OR ".join(clauses)})' class MySQLQueryConfig(SQLQueryConfig): @@ -688,7 +688,7 @@ def get_formatted_query_string( ) -> str: """Returns a query string with backtick formatting for tables that have the same names as MySQL reserved words.""" - return f'SELECT {field_list} FROM `{self.node.collection.name}` WHERE {" OR ".join(clauses)}' + return f'SELECT {field_list} FROM `{self.node.collection.name}` WHERE ({" OR ".join(clauses)})' class QueryStringWithoutTuplesOverrideQueryConfig(SQLQueryConfig): @@ -797,7 +797,7 @@ def get_formatted_query_string( clauses: List[str], ) -> str: """Returns a query string with double quotation mark formatting as required by Snowflake syntax.""" - return f'SELECT {field_list} FROM "{self.node.collection.name}" WHERE {" OR ".join(clauses)}' + return f'SELECT {field_list} FROM "{self.node.collection.name}" WHERE ({" OR ".join(clauses)})' def format_key_map_for_update_stmt(self, fields: List[str]) -> List[str]: """Adds the appropriate formatting for update statements in this datastore.""" @@ -823,7 +823,7 @@ def get_formatted_query_string( ) -> str: """Returns a query string with double quotation mark formatting for tables that have the same names as Redshift reserved words.""" - return f'SELECT {field_list} FROM "{self.node.collection.name}" WHERE {" OR ".join(clauses)}' + return f'SELECT {field_list} FROM "{self.node.collection.name}" WHERE ({" OR ".join(clauses)})' class GoogleCloudSQLPostgresQueryConfig(QueryStringWithoutTuplesOverrideQueryConfig): diff --git a/tests/ops/service/connectors/test_queryconfig.py b/tests/ops/service/connectors/test_queryconfig.py index 5cc03ca680..838f69183c 100644 --- a/tests/ops/service/connectors/test_queryconfig.py +++ b/tests/ops/service/connectors/test_queryconfig.py @@ -63,7 +63,6 @@ privacy_request = PrivacyRequest(id="234544") -@pytest.mark.skip(reason="move to plus in progress") class TestSQLQueryConfig: def test_extract_query_components(self): def found_query_keys(node: ExecutionNode, values: Dict[str, Any]) -> Set[str]: @@ -371,7 +370,6 @@ def test_generate_update_stmts_from_multiple_rules( ) # String rewrite masking strategy -@pytest.mark.skip(reason="move to plus in progress") class TestMongoQueryConfig: @pytest.fixture(scope="function") def combined_traversal(self, connection_config, integration_mongodb_config): @@ -629,7 +627,6 @@ def test_generate_update_stmt_multiple_rules( ) -@pytest.mark.skip(reason="move to plus in progress") class TestDynamoDBQueryConfig: @pytest.fixture(scope="function") def identity(self): @@ -744,7 +741,6 @@ def test_put_query_param_formatting_single_key( } -@pytest.mark.skip(reason="move to plus in progress") class TestBigQueryQueryConfig: @pytest.fixture(scope="function") def bigquery_client(self, bigquery_connection_config): @@ -954,7 +950,6 @@ def test_generate_namespaced_delete_stmt( ) -@pytest.mark.skip(reason="move to plus in progress") class TestScyllaDBQueryConfig: @pytest.fixture(scope="function") def complete_execution_node( @@ -1059,7 +1054,6 @@ def test_generate_query_with_invalid_namespace_meta( assert "Field required" in str(exc) -@pytest.mark.skip(reason="move to plus in progress") class TestSQLLikeQueryConfig: def test_missing_namespace_meta_schema(self): From e37bd3f8ec1b8b4693509389da7546a557c3a1a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Tue, 29 Oct 2024 12:42:33 -0600 Subject: [PATCH 05/14] QA --- .../api/v1/endpoints/test_privacy_request_endpoints.py | 4 ++-- tests/ops/task/test_graph_task.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/ops/api/v1/endpoints/test_privacy_request_endpoints.py b/tests/ops/api/v1/endpoints/test_privacy_request_endpoints.py index db2da05573..ef4c3c6361 100644 --- a/tests/ops/api/v1/endpoints/test_privacy_request_endpoints.py +++ b/tests/ops/api/v1/endpoints/test_privacy_request_endpoints.py @@ -3501,7 +3501,7 @@ def test_request_preview( if response["collectionAddress"]["dataset"] == "postgres" if response["collectionAddress"]["collection"] == "subscriptions" ) - == 'SELECT email, id FROM "subscriptions" WHERE email = ?' + == 'SELECT email, id FROM "subscriptions" WHERE (email = ?)' ) def test_request_preview_incorrect_body( @@ -3578,7 +3578,7 @@ def test_request_preview_all( if response["collectionAddress"]["dataset"] == "postgres" if response["collectionAddress"]["collection"] == "subscriptions" ) - == 'SELECT email, id FROM "subscriptions" WHERE email = ?' + == 'SELECT email, id FROM "subscriptions" WHERE (email = ?)' ) assert ( next( diff --git a/tests/ops/task/test_graph_task.py b/tests/ops/task/test_graph_task.py index 03f3b49b6b..c287303fa8 100644 --- a/tests/ops/task/test_graph_task.py +++ b/tests/ops/task/test_graph_task.py @@ -431,22 +431,22 @@ def test_sql_dry_run_queries(db) -> None: assert ( env[CollectionAddress("mysql", "Customer")] - == 'SELECT customer_id, name, email, contact_address_id FROM "Customer" WHERE email = ?' + == 'SELECT customer_id, name, email, contact_address_id FROM "Customer" WHERE (email = ?)' ) assert ( env[CollectionAddress("mysql", "User")] - == 'SELECT id, user_id, name FROM "User" WHERE user_id = ?' + == 'SELECT id, user_id, name FROM "User" WHERE (user_id = ?)' ) assert ( env[CollectionAddress("postgres", "Order")] - == 'SELECT order_id, customer_id, shipping_address_id, billing_address_id FROM "Order" WHERE customer_id IN (?, ?)' + == 'SELECT order_id, customer_id, shipping_address_id, billing_address_id FROM "Order" WHERE (customer_id IN (?, ?))' ) assert ( env[CollectionAddress("mysql", "Address")] - == 'SELECT id, street, city, state, zip FROM "Address" WHERE id IN (?, ?)' + == 'SELECT id, street, city, state, zip FROM "Address" WHERE (id IN (?, ?))' ) assert ( From e92b8b8721f27d7f1ef1890568796a4e2d1961ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Thu, 31 Oct 2024 10:01:01 -0600 Subject: [PATCH 06/14] Added test case when using more than one identifyind field and partitioning --- CHANGELOG.md | 1 + .../dataset/bigquery_example_test_dataset.yml | 5 ++ tests/fixtures/bigquery_fixtures.py | 51 +++++++++++-------- .../connectors/test_bigquery_connector.py | 35 ++++++++++--- .../service/connectors/test_queryconfig.py | 10 +--- 5 files changed, 66 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d98aa8e5cf..74e971b527 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ The types of changes are: ### Fixed - API router sanitizer being too aggressive with NextJS Catch-all Segments [#5438](https://github.com/ethyca/fides/pull/5438) + - Fix BigQuery partitioning support bug - WHERE queries are not properly generated [#5432](https://github.com/ethyca/fides/pull/5432) ## [2.48.0](https://github.com/ethyca/fidesplus/compare/2.47.1...2.48.0) diff --git a/data/dataset/bigquery_example_test_dataset.yml b/data/dataset/bigquery_example_test_dataset.yml index 46b76a4a31..0420f04f55 100644 --- a/data/dataset/bigquery_example_test_dataset.yml +++ b/data/dataset/bigquery_example_test_dataset.yml @@ -40,6 +40,11 @@ dataset: fides_meta: identity: email data_type: string + - name: custom_id + data_categories: [user.unique_id] + fides_meta: + identity: custom_id + data_type: string - name: id data_categories: [user.unique_id] fides_meta: diff --git a/tests/fixtures/bigquery_fixtures.py b/tests/fixtures/bigquery_fixtures.py index acac500bf2..f676d03d63 100644 --- a/tests/fixtures/bigquery_fixtures.py +++ b/tests/fixtures/bigquery_fixtures.py @@ -35,7 +35,7 @@ def bigquery_connection_config_without_secrets(db: Session) -> Generator: @pytest.fixture(scope="function") -def bigquery_connection_config(db: Session) -> Generator: +def bigquery_connection_config(db: Session, bigquery_keyfile_creds) -> Generator: connection_config = ConnectionConfig.create( db=db, data={ @@ -46,14 +46,11 @@ def bigquery_connection_config(db: Session) -> Generator: }, ) # Pulling from integration config file or GitHub secrets - keyfile_creds = integration_config.get("bigquery", {}).get( - "keyfile_creds" - ) or ast.literal_eval(os.environ.get("BIGQUERY_KEYFILE_CREDS")) dataset = integration_config.get("bigquery", {}).get("dataset") or os.environ.get( "BIGQUERY_DATASET" ) - if keyfile_creds: - schema = BigQuerySchema(keyfile_creds=keyfile_creds, dataset=dataset) + if bigquery_keyfile_creds: + schema = BigQuerySchema(keyfile_creds=bigquery_keyfile_creds, dataset=dataset) connection_config.secrets = schema.model_dump(mode="json") connection_config.save(db=db) @@ -62,7 +59,28 @@ def bigquery_connection_config(db: Session) -> Generator: @pytest.fixture(scope="function") -def bigquery_connection_config_without_default_dataset(db: Session) -> Generator: +def bigquery_keyfile_creds(): + """ + Pulling from integration config file or GitHub secrets + """ + keyfile_creds = integration_config.get("bigquery", {}).get("keyfile_creds") + + if keyfile_creds: + return keyfile_creds + + if "BIGQUERY_KEYFILE_CREDS" in os.environ: + keyfile_creds = ast.literal_eval(os.environ.get("BIGQUERY_KEYFILE_CREDS")) + + if not keyfile_creds: + raise RuntimeError("Missing keyfile_creds for BigQuery") + + yield keyfile_creds + + +@pytest.fixture(scope="function") +def bigquery_connection_config_without_default_dataset( + db: Session, bigquery_keyfile_creds +) -> Generator: connection_config = ConnectionConfig.create( db=db, data={ @@ -72,12 +90,8 @@ def bigquery_connection_config_without_default_dataset(db: Session) -> Generator "access": AccessLevel.write, }, ) - # Pulling from integration config file or GitHub secrets - keyfile_creds = integration_config.get("bigquery", {}).get( - "keyfile_creds" - ) or ast.literal_eval(os.environ.get("BIGQUERY_KEYFILE_CREDS")) - if keyfile_creds: - schema = BigQuerySchema(keyfile_creds=keyfile_creds) + if bigquery_keyfile_creds: + schema = BigQuerySchema(keyfile_creds=bigquery_keyfile_creds) connection_config.secrets = schema.model_dump(mode="json") connection_config.save(db=db) @@ -150,7 +164,7 @@ def bigquery_example_test_dataset_config_with_namespace_and_partitioning_meta( bigquery_connection_config_without_default_dataset: ConnectionConfig, db: Session, example_datasets: List[Dict], -) -> Generator: +) -> Generator[DatasetConfig, None, None]: bigquery_dataset = example_datasets[7] bigquery_dataset["fides_meta"] = { "namespace": { @@ -360,7 +374,7 @@ def bigquery_resources_with_namespace_meta( @pytest.fixture(scope="session") -def bigquery_test_engine() -> Generator: +def bigquery_test_engine(bigquery_keyfile_creds) -> Generator: """Return a connection to a Google BigQuery Warehouse""" connection_config = ConnectionConfig( @@ -370,14 +384,11 @@ def bigquery_test_engine() -> Generator: ) # Pulling from integration config file or GitHub secrets - keyfile_creds = integration_config.get("bigquery", {}).get( - "keyfile_creds" - ) or ast.literal_eval(os.environ.get("BIGQUERY_KEYFILE_CREDS")) dataset = integration_config.get("bigquery", {}).get("dataset") or os.environ.get( "BIGQUERY_DATASET" ) - if keyfile_creds: - schema = BigQuerySchema(keyfile_creds=keyfile_creds, dataset=dataset) + if bigquery_keyfile_creds: + schema = BigQuerySchema(keyfile_creds=bigquery_keyfile_creds, dataset=dataset) connection_config.secrets = schema.model_dump(mode="json") connector: BigQueryConnector = get_connector(connection_config) diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index 7a9ef245b8..06cf8841cf 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -1,7 +1,9 @@ +import logging from typing import Generator import pytest from fideslang.models import Dataset +from loguru import logger from fides.api.graph.config import CollectionAddress from fides.api.graph.graph import DatasetGraph @@ -67,7 +69,9 @@ def execution_node_with_namespace_and_partitioning_meta( dataset_config.connection_config.key, ) dataset_graph = DatasetGraph(graph_dataset) - traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + traversal = Traversal( + dataset_graph, {"email": "customer-1@example.com", "custom_id": "123"} + ) yield traversal.traversal_node_dict[ CollectionAddress("bigquery_example_test_dataset", "customer") @@ -164,6 +168,7 @@ def test_retrieve_partitioned_data( execution_node_with_namespace_and_partitioning_meta, policy, privacy_request_with_email_identity, + caplog, ): """Unit test of BigQueryQueryConfig.generate_delete specifically for a partitioned table""" dataset_config = ( @@ -171,13 +176,27 @@ def test_retrieve_partitioned_data( ) connector = BigQueryConnector(dataset_config.connection_config) - results = connector.retrieve_data( - node=execution_node_with_namespace_and_partitioning_meta, - policy=policy, - privacy_request=privacy_request_with_email_identity, - request_task=RequestTask(), - input_data={"email": ["customer-1@example.com"]}, - ) + handler_id = logger.add(caplog.handler, format="{message}") + with caplog.at_level(logging.INFO): + results = connector.retrieve_data( + node=execution_node_with_namespace_and_partitioning_meta, + policy=policy, + privacy_request=privacy_request_with_email_identity, + request_task=RequestTask(), + input_data={ + "email": ["customer-1@example.com"], + "custom_id": ["123"], + }, + ) + assert ( + "SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP())" + in caplog.text + ) + assert ( + "SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `created` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY))" + in caplog.text + ) + logger.remove(handler_id) assert len(results) == 1 assert results[0]["email"] == "customer-1@example.com" diff --git a/tests/ops/service/connectors/test_queryconfig.py b/tests/ops/service/connectors/test_queryconfig.py index 838f69183c..aad5ee1106 100644 --- a/tests/ops/service/connectors/test_queryconfig.py +++ b/tests/ops/service/connectors/test_queryconfig.py @@ -741,6 +741,8 @@ def test_put_query_param_formatting_single_key( } +@pytest.mark.integration_external +@pytest.mark.integration_bigquery class TestBigQueryQueryConfig: @pytest.fixture(scope="function") def bigquery_client(self, bigquery_connection_config): @@ -769,8 +771,6 @@ def address_node(self, dataset_graph): CollectionAddress("bigquery_example_test_dataset", "address") ].to_mock_execution_node() - @pytest.mark.integration_external - @pytest.mark.integration_bigquery def test_generate_update_stmt( self, db, @@ -812,8 +812,6 @@ def test_generate_update_stmt( == "UPDATE `address` SET `house`=%(house:STRING)s, `street`=%(street:STRING)s, `city`=%(city:STRING)s, `state`=%(state:STRING)s, `zip`=%(zip:STRING)s WHERE `address`.`id` = %(id_1:STRING)s" ) - @pytest.mark.integration_external - @pytest.mark.integration_bigquery def test_generate_namespaced_update_stmt( self, db, @@ -860,8 +858,6 @@ def test_generate_namespaced_update_stmt( == "UPDATE `cool_project.first_dataset.address` SET `house`=%(house:STRING)s, `street`=%(street:STRING)s, `city`=%(city:STRING)s, `state`=%(state:STRING)s, `zip`=%(zip:STRING)s WHERE `address`.`id` = %(id_1:STRING)s" ) - @pytest.mark.integration_external - @pytest.mark.integration_bigquery def test_generate_delete_stmt( self, db, @@ -902,8 +898,6 @@ def test_generate_delete_stmt( == "DELETE FROM `employee` WHERE `employee`.`id` = %(id_1:STRING)s" ) - @pytest.mark.integration_external - @pytest.mark.integration_bigquery def test_generate_namespaced_delete_stmt( self, db, From 628bb8215eaf786c5b3a2d40bff403d8f8facc93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Thu, 31 Oct 2024 11:14:47 -0600 Subject: [PATCH 07/14] QA --- .../connectors/test_bigquery_connector.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index 06cf8841cf..907d3aa48e 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -168,6 +168,30 @@ def test_retrieve_partitioned_data( execution_node_with_namespace_and_partitioning_meta, policy, privacy_request_with_email_identity, + ): + """Unit test of BigQueryQueryConfig.generate_delete specifically for a partitioned table""" + dataset_config = ( + bigquery_example_test_dataset_config_with_namespace_and_partitioning_meta + ) + connector = BigQueryConnector(dataset_config.connection_config) + + results = connector.retrieve_data( + node=execution_node_with_namespace_and_partitioning_meta, + policy=policy, + privacy_request=privacy_request_with_email_identity, + request_task=RequestTask(), + input_data={"email": ["customer-1@example.com"]}, + ) + + assert len(results) == 1 + assert results[0]["email"] == "customer-1@example.com" + + def test_retrieve_partitioned_data_with_multiple_identifying_fields( + self, + bigquery_example_test_dataset_config_with_namespace_and_partitioning_meta: DatasetConfig, + execution_node_with_namespace_and_partitioning_meta, + policy, + privacy_request_with_email_identity, caplog, ): """Unit test of BigQueryQueryConfig.generate_delete specifically for a partitioned table""" From ccb271b38d7b6de68158fd648d488534068a0022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Thu, 31 Oct 2024 11:24:10 -0600 Subject: [PATCH 08/14] QA --- .../service/connectors/test_bigquery_connector.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index 907d3aa48e..7d8e89455c 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -192,16 +192,15 @@ def test_retrieve_partitioned_data_with_multiple_identifying_fields( execution_node_with_namespace_and_partitioning_meta, policy, privacy_request_with_email_identity, - caplog, + loguru_caplog, ): - """Unit test of BigQueryQueryConfig.generate_delete specifically for a partitioned table""" + """Unit test of BigQueryQueryConfig.generate_delete specifically for a partitioned table with multiple identifying fields""" dataset_config = ( bigquery_example_test_dataset_config_with_namespace_and_partitioning_meta ) connector = BigQueryConnector(dataset_config.connection_config) - handler_id = logger.add(caplog.handler, format="{message}") - with caplog.at_level(logging.INFO): + with loguru_caplog.at_level(logging.INFO): results = connector.retrieve_data( node=execution_node_with_namespace_and_partitioning_meta, policy=policy, @@ -214,13 +213,12 @@ def test_retrieve_partitioned_data_with_multiple_identifying_fields( ) assert ( "SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP())" - in caplog.text + in loguru_caplog.text ) assert ( "SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `created` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY))" - in caplog.text + in loguru_caplog.text ) - logger.remove(handler_id) assert len(results) == 1 assert results[0]["email"] == "customer-1@example.com" From 435b184ed89904ba5357c503413665357f0e0348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Thu, 31 Oct 2024 11:59:39 -0600 Subject: [PATCH 09/14] QA --- tests/ops/service/connectors/test_bigquery_connector.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index 7d8e89455c..df62351d75 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -3,7 +3,6 @@ import pytest from fideslang.models import Dataset -from loguru import logger from fides.api.graph.config import CollectionAddress from fides.api.graph.graph import DatasetGraph @@ -211,14 +210,18 @@ def test_retrieve_partitioned_data_with_multiple_identifying_fields( "custom_id": ["123"], }, ) + # Check that the correct SQL queries were executed and logged by sqlalchemy.engine.Engine assert ( - "SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP())" + "INFO sqlalchemy.engine.Engine:log.py:117 SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP())" in loguru_caplog.text ) assert ( - "SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `created` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY))" + "INFO sqlalchemy.engine.Engine:log.py:117 SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `created` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY))" in loguru_caplog.text ) + print("loguru_caplog.text") + print(loguru_caplog.text) + print("loguru_caplog.text") assert len(results) == 1 assert results[0]["email"] == "customer-1@example.com" From 025d351f358c87a976f3b0e2800d5e16b5e33015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Thu, 31 Oct 2024 12:03:06 -0600 Subject: [PATCH 10/14] QA --- tests/ops/service/connectors/test_bigquery_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index df62351d75..9bec5e6e17 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -168,7 +168,7 @@ def test_retrieve_partitioned_data( policy, privacy_request_with_email_identity, ): - """Unit test of BigQueryQueryConfig.generate_delete specifically for a partitioned table""" + """Unit test of BigQueryQueryConfig.retrieve_data specifically for a partitioned table""" dataset_config = ( bigquery_example_test_dataset_config_with_namespace_and_partitioning_meta ) @@ -193,7 +193,7 @@ def test_retrieve_partitioned_data_with_multiple_identifying_fields( privacy_request_with_email_identity, loguru_caplog, ): - """Unit test of BigQueryQueryConfig.generate_delete specifically for a partitioned table with multiple identifying fields""" + """Unit test of BigQueryQueryConfig.retrieve_data specifically for a partitioned table with multiple identifying fields""" dataset_config = ( bigquery_example_test_dataset_config_with_namespace_and_partitioning_meta ) From b4742e3d3fc9071ad6e7bcba3670d03d4502bb01 Mon Sep 17 00:00:00 2001 From: Andres Torres Date: Thu, 31 Oct 2024 12:15:57 -0600 Subject: [PATCH 11/14] Update CHANGELOG.md Co-authored-by: Neville Samuell --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74e971b527..e52e6930e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ The types of changes are: ### Fixed - API router sanitizer being too aggressive with NextJS Catch-all Segments [#5438](https://github.com/ethyca/fides/pull/5438) - - Fix BigQuery partitioning support bug - WHERE queries are not properly generated [#5432](https://github.com/ethyca/fides/pull/5432) + - Fix BigQuery `partitioning` queries to properly support multiple identity clauses [#5432](https://github.com/ethyca/fides/pull/5432) ## [2.48.0](https://github.com/ethyca/fidesplus/compare/2.47.1...2.48.0) From 68e9aea49ccf46ebd4c7b4ccd12b19d5f6bbff65 Mon Sep 17 00:00:00 2001 From: Andres Torres Date: Thu, 31 Oct 2024 12:27:10 -0600 Subject: [PATCH 12/14] Update tests/ops/service/connectors/test_bigquery_connector.py --- tests/ops/service/connectors/test_bigquery_connector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index 9bec5e6e17..e9f1bdf3d0 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -219,9 +219,6 @@ def test_retrieve_partitioned_data_with_multiple_identifying_fields( "INFO sqlalchemy.engine.Engine:log.py:117 SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2000 DAY) AND `created` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY))" in loguru_caplog.text ) - print("loguru_caplog.text") - print(loguru_caplog.text) - print("loguru_caplog.text") assert len(results) == 1 assert results[0]["email"] == "customer-1@example.com" From f3a386b38f8e489999335e15f30012033563a756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Thu, 31 Oct 2024 12:29:08 -0600 Subject: [PATCH 13/14] QA --- tests/ops/service/connectors/test_bigquery_connector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/ops/service/connectors/test_bigquery_connector.py b/tests/ops/service/connectors/test_bigquery_connector.py index e9f1bdf3d0..98ad524204 100644 --- a/tests/ops/service/connectors/test_bigquery_connector.py +++ b/tests/ops/service/connectors/test_bigquery_connector.py @@ -211,6 +211,9 @@ def test_retrieve_partitioned_data_with_multiple_identifying_fields( }, ) # Check that the correct SQL queries were executed and logged by sqlalchemy.engine.Engine + # This may be not be the best way to test this, but it's the best I could come up with + # without modifying the BigQueryConnector class to allow for a SQL queries generation + # that's decoupled from the actual execution of the queries. assert ( "INFO sqlalchemy.engine.Engine:log.py:117 SELECT address_id, created, custom_id, email, id, name FROM `silken-precinct-284918.fidesopstest.customer` WHERE (email = %(email)s OR custom_id = %(custom_id)s) AND (`created` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `created` <= CURRENT_TIMESTAMP())" in loguru_caplog.text From 59f86bed0706f1885c8f70ed7dd0adf19191d45a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Torres=20Marroqu=C3=ADn?= Date: Thu, 31 Oct 2024 13:13:16 -0600 Subject: [PATCH 14/14] QA --- tests/fixtures/bigquery_fixtures.py | 2 +- .../test_external_database_connections.py | 24 +++++++++---------- .../service/connectors/test_queryconfig.py | 6 ++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/fixtures/bigquery_fixtures.py b/tests/fixtures/bigquery_fixtures.py index f676d03d63..9f282f5c4d 100644 --- a/tests/fixtures/bigquery_fixtures.py +++ b/tests/fixtures/bigquery_fixtures.py @@ -58,7 +58,7 @@ def bigquery_connection_config(db: Session, bigquery_keyfile_creds) -> Generator connection_config.delete(db) -@pytest.fixture(scope="function") +@pytest.fixture(scope="session") def bigquery_keyfile_creds(): """ Pulling from integration config file or GitHub secrets diff --git a/tests/ops/integration_tests/test_external_database_connections.py b/tests/ops/integration_tests/test_external_database_connections.py index d5077bbc10..04013d68d3 100644 --- a/tests/ops/integration_tests/test_external_database_connections.py +++ b/tests/ops/integration_tests/test_external_database_connections.py @@ -169,18 +169,18 @@ def test_bigquery_example_data(bigquery_test_engine): inspector = inspect(bigquery_test_engine) assert sorted(inspector.get_table_names(schema="fidesopstest")) == sorted( [ - "address", - "customer", - "employee", - "login", - "order_item", - "orders", - "payment_card", - "product", - "report", - "service_request", - "visit", - "visit_partitioned", + "fidesopstest.address", + "fidesopstest.customer", + "fidesopstest.employee", + "fidesopstest.login", + "fidesopstest.order_item", + "fidesopstest.orders", + "fidesopstest.payment_card", + "fidesopstest.product", + "fidesopstest.report", + "fidesopstest.service_request", + "fidesopstest.visit", + "fidesopstest.visit_partitioned", ] ) diff --git a/tests/ops/service/connectors/test_queryconfig.py b/tests/ops/service/connectors/test_queryconfig.py index aad5ee1106..1b2871cab4 100644 --- a/tests/ops/service/connectors/test_queryconfig.py +++ b/tests/ops/service/connectors/test_queryconfig.py @@ -1014,16 +1014,16 @@ def execution_node( BigQueryNamespaceMeta( project_id="cool_project", dataset_id="first_dataset" ), - "SELECT address_id, created, email, id, name FROM `cool_project.first_dataset.customer` WHERE (email = :email)", + "SELECT address_id, created, custom_id, email, id, name FROM `cool_project.first_dataset.customer` WHERE (email = :email)", ), # Namespace meta will be a dict / JSON when retrieved from the DB ( {"project_id": "cool_project", "dataset_id": "first_dataset"}, - "SELECT address_id, created, email, id, name FROM `cool_project.first_dataset.customer` WHERE (email = :email)", + "SELECT address_id, created, custom_id, email, id, name FROM `cool_project.first_dataset.customer` WHERE (email = :email)", ), ( None, - "SELECT address_id, created, email, id, name FROM `customer` WHERE (email = :email)", + "SELECT address_id, created, custom_id, email, id, name FROM `customer` WHERE (email = :email)", ), ], )