From 89bf82de72a841379d56e1b353580f7756bd7913 Mon Sep 17 00:00:00 2001 From: Matti Remes Date: Sun, 26 Feb 2017 15:24:10 -0500 Subject: [PATCH] Use name and type comparison when appending a dataframe into table I modified GbqConnector.verify_schema function to parse name and type from the remote schema (basically dropping mode) and include those in the compared fields. Currently, when appending to a BQ table, comparison between the destination table's schema and a dataframe schema is done over a superset of a BQ schema definition (name, type, mode) when _generate_bq_schema parses only name and type from a dataframe. IMO it would be inconvenient to make the mode check in the module by generating completeness of columns (includes null values or not). So raising a generic GBQ error is more convenient here. closes #13 Author: Matti Remes Closes #14 from mremes/master and squashes the following commits: bf8c378 [Matti Remes] added reference to issue #13 77b1fd5 [Matti Remes] changelog for verify_schema changes 70d08ef [Matti Remes] make the syntax of the test flake-pretty 45826f1 [Matti Remes] Merge remote-tracking branch 'upstream/master' 66aa616 [Matti Remes] Added test for validate_schema ignoring field mode when comparing schemas 5dafd55 [Matti Remes] fix bug with selecting key 631d66c [Matti Remes] Use name and type of fields for comparing remote and local schemas when appending to a table --- docs/source/changelog.rst | 6 ++++-- docs/source/conf.py | 5 ++--- pandas_gbq/gbq.py | 6 +++++- pandas_gbq/tests/test_gbq.py | 28 ++++++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 3ec2a3df..b2a6b83e 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,8 +1,10 @@ Changelog ========= -0.2.0 / 2017-? --------------- +0.2.0 / 2017-03-?? 
+------------------ + +- Bug with appending to a BigQuery table where fields have modes (NULLABLE,REQUIRED,REPEATED) specified. These modes were compared versus the remote schema and writing a table via ``to_gbq`` would previously raise. (:issue:`13`) 0.1.2 / 2017-02-23 ------------------ diff --git a/docs/source/conf.py b/docs/source/conf.py index 32c2fcfc..94c8d229 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -353,6 +353,5 @@ intersphinx_mapping = {'https://docs.python.org/': None} extlinks = {'issue': ('https://github.com/pydata/pandas-gbq/issues/%s', - 'GH'), - 'wiki': ('https://github.com/pydata/pandas-gbq/wiki/%s', - 'wiki ')} + 'GH#'), + 'pr': ('https://github.com/pydata/pandas-gbq/pull/%s', 'GH#')} diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 9759e379..060724ed 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -563,8 +563,12 @@ def verify_schema(self, dataset_id, table_id, schema): datasetId=dataset_id, tableId=table_id).execute()['schema'] + remote_fields = [{'name': field_remote['name'], + 'type': field_remote['type']} + for field_remote in remote_schema['fields']] + fields_remote = set([json.dumps(field_remote) - for field_remote in remote_schema['fields']]) + for field_remote in remote_fields]) fields_local = set(json.dumps(field_local) for field_local in schema['fields']) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 036e8330..6a3cad19 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -1161,6 +1161,34 @@ def test_upload_data_flexible_column_order(self): _get_project_id(), if_exists='append', private_key=_get_private_key_path()) + def test_verify_schema_ignores_field_mode(self): + test_id = "14" + test_schema_1 = {'fields': [{'name': 'A', + 'type': 'FLOAT', + 'mode': 'NULLABLE'}, + {'name': 'B', + 'type': 'FLOAT', + 'mode': 'NULLABLE'}, + {'name': 'C', + 'type': 'STRING', + 'mode': 'NULLABLE'}, + {'name': 'D', + 'type': 
'TIMESTAMP', + 'mode': 'REQUIRED'}]} + test_schema_2 = {'fields': [{'name': 'A', + 'type': 'FLOAT'}, + {'name': 'B', + 'type': 'FLOAT'}, + {'name': 'C', + 'type': 'STRING'}, + {'name': 'D', + 'type': 'TIMESTAMP'}]} + + self.table.create(TABLE_ID + test_id, test_schema_1) + self.assertTrue(self.sut.verify_schema( + self.dataset_prefix + "1", TABLE_ID + test_id, test_schema_2), + 'Expected schema to match') + def test_list_dataset(self): dataset_id = self.dataset_prefix + "1" self.assertTrue(dataset_id in self.dataset.datasets(),