From c23b1a4c8cb4ac87c9e71703285393e5904e2a8a Mon Sep 17 00:00:00 2001 From: Piotr Chromiec Date: Thu, 9 Feb 2017 12:08:02 -0500 Subject: [PATCH] BUG: fix read_gbq lost precision for longs above 2^53 and floats above 10k closes #14020 closes #14305 Author: Piotr Chromiec Closes #14064 from tworec/read_gbq_full_long_support and squashes the following commits: 788ccee [Piotr Chromiec] BUG: fix read_gbq lost numeric precision --- doc/source/install.rst | 13 +- doc/source/io.rst | 61 +++++-- doc/source/whatsnew/v0.20.0.txt | 5 +- pandas/io/gbq.py | 24 +-- pandas/io/tests/test_gbq.py | 288 +++++++++++++++++++++----------- 5 files changed, 263 insertions(+), 128 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index 158a6e5562b7a..4b3ea19624a0e 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -250,9 +250,9 @@ Optional Dependencies * `Feather Format `__: necessary for feather-based storage, version 0.3.1 or higher. * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are: - - `psycopg2 `__: for PostgreSQL - - `pymysql `__: for MySQL. - - `SQLite `__: for SQLite, this is included in Python's standard library by default. + * `psycopg2 `__: for PostgreSQL + * `pymysql `__: for MySQL. + * `SQLite `__: for SQLite, this is included in Python's standard library by default. * `matplotlib `__: for plotting * For Excel I/O: @@ -272,11 +272,8 @@ Optional Dependencies `__, or `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* Google's `python-gflags <`__ , - `oauth2client `__ , - `httplib2 `__ - and `google-api-python-client `__ - : Needed for :mod:`~pandas.io.gbq` +* For Google BigQuery I/O - see :ref:`here `. + * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. * One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: diff --git a/doc/source/io.rst b/doc/source/io.rst index 4c78758a0e2d2..22eac33a715ba 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -39,7 +39,7 @@ object. * :ref:`read_json` * :ref:`read_msgpack` * :ref:`read_html` - * :ref:`read_gbq` + * :ref:`read_gbq` * :ref:`read_stata` * :ref:`read_sas` * :ref:`read_clipboard` @@ -55,7 +55,7 @@ The corresponding ``writer`` functions are object methods that are accessed like * :ref:`to_json` * :ref:`to_msgpack` * :ref:`to_html` - * :ref:`to_gbq` + * :ref:`to_gbq` * :ref:`to_stata` * :ref:`to_clipboard` * :ref:`to_pickle` @@ -4648,16 +4648,11 @@ DataFrame with a shape and data types derived from the source table. Additionally, DataFrames can be inserted into new BigQuery tables or appended to existing tables. -You will need to install some additional dependencies: - -- Google's `python-gflags `__ -- `httplib2 `__ -- `google-api-python-client `__ - .. warning:: To use this module, you will need a valid BigQuery account. Refer to the - `BigQuery Documentation `__ for details on the service itself. + `BigQuery Documentation `__ + for details on the service itself. The key functions are: @@ -4671,7 +4666,44 @@ The key functions are: .. currentmodule:: pandas -.. 
_io.bigquery_reader:
+
+Supported Data Types
+++++++++++++++++++++
+
+Pandas supports all these `BigQuery data types `__:
+``STRING``, ``INTEGER`` (64-bit), ``FLOAT`` (64-bit), ``BOOLEAN`` and
+``TIMESTAMP`` (microsecond precision). Data types ``BYTES`` and ``RECORD``
+are not supported.
+
+Integer and boolean ``NA`` handling
++++++++++++++++++++++++++++++++++++
+
+.. versionadded:: 0.20
+
+Since all columns in BigQuery queries are nullable, and NumPy lacks ``NA``
+support for integer and boolean types, this module will store ``INTEGER`` or
+``BOOLEAN`` columns with at least one ``NULL`` value as ``dtype=object``.
+Otherwise those columns will be stored as ``dtype=int64`` or ``dtype=bool``
+respectively.
+
+This is the opposite of the default pandas behaviour, which promotes integer
+types to float in order to store NAs. See the :ref:`gotchas`
+for a detailed explanation.
+
+While this trade-off works well in most cases, it breaks down when storing
+values greater than 2**53. Such values in BigQuery commonly represent
+identifiers, and silently losing precision on an identifier is exactly what
+we want to avoid.
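+
+For illustration, the two queries below (paraphrasing this patch's own test
+cases; the project id is a placeholder) produce different dtypes:
+
+.. code-block:: python
+
+   # no NULLs -> plain int64
+   pd.read_gbq('SELECT 1 AS i', project_id='my-project')['i'].dtype
+   # dtype('int64')
+
+   # at least one NULL -> object, so large integers stay exact
+   pd.read_gbq('SELECT * FROM (SELECT 1 AS i), (SELECT NULL AS i)',
+               project_id='my-project')['i'].dtype
+   # dtype('O')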
+
+.. _io.bigquery_deps:
+
+Dependencies
+++++++++++++
+
+This module requires the following additional dependencies:
+
+- `httplib2 `__: HTTP client
+- `google-api-python-client `__: Google's API client
+- `oauth2client `__: authentication and authorization for Google's API

.. _io.bigquery_authentication:

Authentication
''''''''''''''

@@ -4686,7 +4718,7 @@ Is possible to authenticate with either user account credentials or service acco
Authenticating with user account credentials is as simple as following the prompts in a
browser window which will be automatically opened for you. You will be authenticated
to the specified ``BigQuery`` account using the product name ``pandas GBQ``.
It is only possible on local host.
-The remote authentication using user account credentials is not currently supported in Pandas.
+The remote authentication using user account credentials is not currently supported in pandas.
Additional information on the authentication mechanism can be found
`here `__.

@@ -4695,8 +4727,6 @@ is particularly useful when working on remote servers (eg. jupyter iPython noteb
Additional information on service accounts can be found
`here `__.

-You will need to install an additional dependency: `oauth2client `__.
-
Authentication via ``application default credentials`` is also possible. This is only valid
if the parameter ``private_key`` is not provided. This method also requires that
the credentials can be fetched from the environment the code is running in.

@@ -4716,6 +4746,7 @@ Additional information on
A private key can be obtained from the Google developers console by clicking
`here `__. Use JSON key type.

+.. _io.bigquery_reader:

Querying
''''''''

@@ -4775,7 +4806,6 @@ For more information about query configuration parameters see

.. _io.bigquery_writer:

-
Writing DataFrames
''''''''''''''''''

@@ -4865,6 +4895,8 @@ For example:
   often as the service seems to be changing and evolving. BiqQuery is best for analyzing large
   sets of data quickly, but it is not a direct replacement for a transactional database.

+.. _io.bigquery_create_tables:
+
Creating BigQuery Tables
''''''''''''''''''''''''

@@ -4894,6 +4926,7 @@ produce the dictionary representation schema of the specified pandas DataFrame.
   the new table with a different name. Refer to
   `Google BigQuery issue 191 `__.

+
 .. _io.stata:

Stata Format

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index e765cdef4d219..9eae2b7a33923 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -369,7 +369,9 @@ Other API Changes
- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`)
- ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
-  - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
+- ``DataFrame.asof()`` will return a null filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
+- The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision loss for integers greater than 2**53. Furthermore, ``FLOAT`` columns with values above 10**4 are no longer cast to ``int64``, which also caused precision loss (:issue:`14064`, :issue:`14305`).
+
.. _whatsnew_0200.deprecations:

Deprecations
@@ -439,6 +441,7 @@ Bug Fixes

- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

+
- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)
- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)
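The 2**53 cutoff mentioned above is not arbitrary: ``float64`` carries a
53-bit mantissa, so larger integers cannot survive a round trip through it.
A quick standalone check (plain Python, independent of the patch):

.. code-block:: python

   big_id = 2 ** 53 + 1           # e.g. a 64-bit identifier from BigQuery
   int(float(big_id)) == big_id   # False: float64 silently drops the +1
   int(str(big_id)) == big_id     # True: parsing the value as int, as
                                  # _parse_entry below now does, is exact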
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index 966f53e9d75ef..76c228418a616 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -603,18 +603,14 @@ def _parse_data(schema, rows):
     # see:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
-    dtype_map = {'INTEGER': np.dtype(float),
-                 'FLOAT': np.dtype(float),
-                 # This seems to be buggy without nanosecond indicator
+    dtype_map = {'FLOAT': np.dtype(float),
                  'TIMESTAMP': 'M8[ns]'}

     fields = schema['fields']
     col_types = [field['type'] for field in fields]
     col_names = [str(field['name']) for field in fields]
     col_dtypes = [dtype_map.get(field['type'], object) for field in fields]
-    page_array = np.zeros((len(rows),),
-                          dtype=lzip(col_names, col_dtypes))
-
+    page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes))
     for row_num, raw_row in enumerate(rows):
         entries = raw_row.get('f', [])
         for col_num, field_type in enumerate(col_types):
@@ -628,7 +624,9 @@ def _parse_data(schema, rows):
 def _parse_entry(field_value, field_type):
     if field_value is None or field_value == 'null':
         return None
-    if field_type == 'INTEGER' or field_type == 'FLOAT':
+    if field_type == 'INTEGER':
+        return int(field_value)
+    elif field_type == 'FLOAT':
         return float(field_value)
     elif field_type == 'TIMESTAMP':
         timestamp = datetime.utcfromtimestamp(float(field_value))
@@ -757,10 +755,14 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
                 'Column order does not match this DataFrame.'
             )

-    # Downcast floats to integers and objects to booleans
-    # if there are no NaN's. This is presently due to a
-    # limitation of numpy in handling missing data.
-    final_df._data = final_df._data.downcast(dtypes='infer')
+    # cast BOOLEAN and INTEGER columns from object to bool/int
+    # if they don't have any nulls
+    type_map = {'BOOLEAN': bool, 'INTEGER': int}
+    for field in schema['fields']:
+        if field['type'] in type_map and \
+                final_df[field['name']].notnull().all():
+            final_df[field['name']] = \
+                final_df[field['name']].astype(type_map[field['type']])

     connector.print_elapsed_seconds(
         'Total time taken',
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index 457e2d218cb33..1157482d7ae67 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -46,6 +46,11 @@ def _skip_if_no_project_id():
                             "Cannot run integration tests without a project id")


+def _skip_local_auth_if_in_travis_env():
+    if _in_travis_environment():
+        raise nose.SkipTest("Cannot run local auth in travis environment")
+
+
 def _skip_if_no_private_key_path():
     if not _get_private_key_path():
         raise nose.SkipTest("Cannot run integration tests without a "
@@ -248,14 +253,14 @@ def test_generate_bq_schema_deprecated():
         gbq.generate_bq_schema(df)


-class TestGBQConnectorIntegration(tm.TestCase):
+class TestGBQConnectorIntegrationWithLocalUserAccountAuth(tm.TestCase):

     def setUp(self):
         _setup_common()
         _skip_if_no_project_id()
+        _skip_local_auth_if_in_travis_env()

-        self.sut = gbq.GbqConnector(_get_project_id(),
-                                    private_key=_get_private_key_path())
+        self.sut = gbq.GbqConnector(_get_project_id())

     def test_should_be_able_to_make_a_connector(self):
         self.assertTrue(self.sut is not None,
@@ -293,8 +298,7 @@ def test_get_application_default_credentials_returns_credentials(self):
         self.assertTrue(isinstance(credentials, GoogleCredentials))


-class TestGBQConnectorServiceAccountKeyPathIntegration(tm.TestCase):
-
+class TestGBQConnectorIntegrationWithServiceAccountKeyPath(tm.TestCase):
     def setUp(self):
         _setup_common()

@@ -325,16 +329,15 @@ def test_should_be_able_to_get_results_from_query(self):
         self.assertTrue(pages is not None)


-class TestGBQConnectorServiceAccountKeyContentsIntegration(tm.TestCase):
-
+class TestGBQConnectorIntegrationWithServiceAccountKeyContents(tm.TestCase):
     def setUp(self):
         _setup_common()

         _skip_if_no_project_id()
-        _skip_if_no_private_key_path()
+        _skip_if_no_private_key_contents()

         self.sut = gbq.GbqConnector(_get_project_id(),
-                                    private_key=_get_private_key_path())
+                                    private_key=_get_private_key_contents())

     def test_should_be_able_to_make_a_connector(self):
         self.assertTrue(self.sut is not None,
@@ -373,9 +376,9 @@ def test_import_google_api_python_client(self):
         from googleapiclient.discovery import build  # noqa
         from googleapiclient.errors import HttpError  # noqa

-    def test_should_return_bigquery_integers_as_python_floats(self):
+    def test_should_return_bigquery_integers_as_python_ints(self):
         result = gbq._parse_entry(1, 'INTEGER')
-        tm.assert_equal(result, float(1))
+        tm.assert_equal(result, int(1))

     def test_should_return_bigquery_floats_as_python_floats(self):
         result = gbq._parse_entry(1, 'FLOAT')
@@ -403,15 +406,15 @@ def test_to_gbq_with_no_project_id_given_should_fail(self):

     def test_read_gbq_with_no_project_id_given_should_fail(self):
         with tm.assertRaises(TypeError):
-            gbq.read_gbq('SELECT "1" as NUMBER_1')
+            gbq.read_gbq('SELECT 1')

     def test_that_parse_data_works_properly(self):
         test_schema = {'fields': [
-            {'mode': 'NULLABLE', 'name': 'VALID_STRING', 'type': 'STRING'}]}
+            {'mode': 'NULLABLE', 'name': 'valid_string', 'type': 'STRING'}]}
         test_page = [{'f': [{'v': 'PI'}]}]

         test_output = gbq._parse_data(test_schema, test_page)
-
correct_output = DataFrame({'VALID_STRING': ['PI']}) + correct_output = DataFrame({'valid_string': ['PI']}) tm.assert_frame_equal(test_output, correct_output) def test_read_gbq_with_invalid_private_key_json_should_fail(self): @@ -435,12 +438,12 @@ def test_read_gbq_with_empty_private_key_file_should_fail(self): private_key=empty_file_path) def test_read_gbq_with_corrupted_private_key_json_should_fail(self): - _skip_if_no_private_key_path() + _skip_if_no_private_key_contents() with tm.assertRaises(gbq.InvalidPrivateKeyFormat): gbq.read_gbq( 'SELECT 1', project_id='x', - private_key=re.sub('[a-z]', '9', _get_private_key_path())) + private_key=re.sub('[a-z]', '9', _get_private_key_contents())) class TestReadGBQIntegration(tm.TestCase): @@ -475,112 +478,207 @@ def tearDown(self): pass def test_should_read_as_user_account(self): - if _in_travis_environment(): - raise nose.SkipTest("Cannot run local auth in travis environment") + _skip_local_auth_if_in_travis_env() - query = 'SELECT "PI" as VALID_STRING' + query = 'SELECT "PI" AS valid_string' df = gbq.read_gbq(query, project_id=_get_project_id()) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) def test_should_read_as_service_account_with_key_path(self): _skip_if_no_private_key_path() - query = 'SELECT "PI" as VALID_STRING' + query = 'SELECT "PI" AS valid_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) def test_should_read_as_service_account_with_key_contents(self): _skip_if_no_private_key_contents() - query = 'SELECT "PI" as VALID_STRING' + query = 'SELECT "PI" AS valid_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_contents()) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) + + +class TestReadGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): + + @classmethod + def setUpClass(cls): + # - GLOBAL CLASS FIXTURES - + # put here any instruction you want to execute only *ONCE* *BEFORE* + # executing *ALL* tests described below. + + _skip_if_no_project_id() + _skip_if_no_private_key_path() + + _setup_common() + + def setUp(self): + # - PER-TEST FIXTURES - + # put here any instruction you want to be run *BEFORE* *EVERY* test is + # executed. + pass + + @classmethod + def tearDownClass(cls): + # - GLOBAL CLASS FIXTURES - + # put here any instruction you want to execute only *ONCE* *AFTER* + # executing all tests. + pass + + def tearDown(self): + # - PER-TEST FIXTURES - + # put here any instructions you want to be run *AFTER* *EVERY* test is + # executed. 
+ pass def test_should_properly_handle_valid_strings(self): - query = 'SELECT "PI" as VALID_STRING' + query = 'SELECT "PI" AS valid_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) def test_should_properly_handle_empty_strings(self): - query = 'SELECT "" as EMPTY_STRING' + query = 'SELECT "" AS empty_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'EMPTY_STRING': [""]})) + tm.assert_frame_equal(df, DataFrame({'empty_string': [""]})) def test_should_properly_handle_null_strings(self): - query = 'SELECT STRING(NULL) as NULL_STRING' + query = 'SELECT STRING(NULL) AS null_string' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_STRING': [None]})) + tm.assert_frame_equal(df, DataFrame({'null_string': [None]})) def test_should_properly_handle_valid_integers(self): - query = 'SELECT INTEGER(3) as VALID_INTEGER' + query = 'SELECT INTEGER(3) AS valid_integer' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal(df, DataFrame({'valid_integer': [3]})) + + def test_should_properly_handle_nullable_integers(self): + query = '''SELECT * FROM + (SELECT 1 AS nullable_integer), + (SELECT NULL AS nullable_integer)''' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'VALID_INTEGER': [3]})) + tm.assert_frame_equal( + df, DataFrame({'nullable_integer': [1, None]}).astype(object)) + + def test_should_properly_handle_valid_longs(self): + query = 'SELECT 1 << 62 AS valid_long' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal( + df, DataFrame({'valid_long': [1 << 62]})) + + def test_should_properly_handle_nullable_longs(self): + query = '''SELECT * FROM + (SELECT 1 << 62 AS nullable_long), + (SELECT NULL AS nullable_long)''' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal( + df, DataFrame({'nullable_long': [1 << 62, None]}).astype(object)) def test_should_properly_handle_null_integers(self): - query = 'SELECT INTEGER(NULL) as NULL_INTEGER' + query = 'SELECT INTEGER(NULL) AS null_integer' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_INTEGER': [np.nan]})) + tm.assert_frame_equal(df, DataFrame({'null_integer': [None]})) def test_should_properly_handle_valid_floats(self): - query = 'SELECT PI() as VALID_FLOAT' + from math import pi + query = 'SELECT PI() AS valid_float' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal(df, DataFrame( + {'valid_float': [pi]})) + + def test_should_properly_handle_nullable_floats(self): + from math import pi + query = '''SELECT * FROM + (SELECT PI() AS nullable_float), + (SELECT NULL AS nullable_float)''' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal( + df, DataFrame({'nullable_float': [pi, None]})) + + def test_should_properly_handle_valid_doubles(self): + from math import pi + query = 'SELECT PI() * POW(10, 307) AS 
valid_double' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame( - {'VALID_FLOAT': [3.141592653589793]})) + {'valid_double': [pi * 10 ** 307]})) + + def test_should_properly_handle_nullable_doubles(self): + from math import pi + query = '''SELECT * FROM + (SELECT PI() * POW(10, 307) AS nullable_double), + (SELECT NULL AS nullable_double)''' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal( + df, DataFrame({'nullable_double': [pi * 10 ** 307, None]})) def test_should_properly_handle_null_floats(self): - query = 'SELECT FLOAT(NULL) as NULL_FLOAT' + query = 'SELECT FLOAT(NULL) AS null_float' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_FLOAT': [np.nan]})) + tm.assert_frame_equal(df, DataFrame({'null_float': [np.nan]})) def test_should_properly_handle_timestamp_unix_epoch(self): - query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH' + query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame( - {'UNIX_EPOCH': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) + {'unix_epoch': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) def test_should_properly_handle_arbitrary_timestamp(self): - query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP' + query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({ - 'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')] + 'valid_timestamp': [np.datetime64('2004-09-15T05:00:00.000000Z')] })) def test_should_properly_handle_null_timestamp(self): - query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP' + query = 'SELECT TIMESTAMP(NULL) AS null_timestamp' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_TIMESTAMP': [NaT]})) + tm.assert_frame_equal(df, DataFrame({'null_timestamp': [NaT]})) def test_should_properly_handle_true_boolean(self): - query = 'SELECT BOOLEAN(TRUE) as TRUE_BOOLEAN' + query = 'SELECT BOOLEAN(TRUE) AS true_boolean' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'TRUE_BOOLEAN': [True]})) + tm.assert_frame_equal(df, DataFrame({'true_boolean': [True]})) def test_should_properly_handle_false_boolean(self): - query = 'SELECT BOOLEAN(FALSE) as FALSE_BOOLEAN' + query = 'SELECT BOOLEAN(FALSE) AS false_boolean' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'FALSE_BOOLEAN': [False]})) + tm.assert_frame_equal(df, DataFrame({'false_boolean': [False]})) def test_should_properly_handle_null_boolean(self): - query = 'SELECT BOOLEAN(NULL) as NULL_BOOLEAN' + query = 'SELECT BOOLEAN(NULL) AS null_boolean' + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) + tm.assert_frame_equal(df, DataFrame({'null_boolean': [None]})) + + def test_should_properly_handle_nullable_booleans(self): + query = '''SELECT * FROM + (SELECT BOOLEAN(TRUE) AS nullable_boolean), + (SELECT NULL AS nullable_boolean)''' df = gbq.read_gbq(query, 
project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({'NULL_BOOLEAN': [None]})) + tm.assert_frame_equal( + df, DataFrame({'nullable_boolean': [True, None]}).astype(object)) def test_unicode_string_conversion_and_normalization(self): correct_test_datatype = DataFrame( - {'UNICODE_STRING': [u("\xe9\xfc")]} + {'unicode_string': [u("\xe9\xfc")]} ) unicode_string = "\xc3\xa9\xc3\xbc" @@ -588,40 +686,40 @@ def test_unicode_string_conversion_and_normalization(self): if compat.PY3: unicode_string = unicode_string.encode('latin-1').decode('utf8') - query = 'SELECT "{0}" as UNICODE_STRING'.format(unicode_string) + query = 'SELECT "{0}" AS unicode_string'.format(unicode_string) df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal(df, correct_test_datatype) def test_index_column(self): - query = "SELECT 'a' as STRING_1, 'b' as STRING_2" + query = "SELECT 'a' AS string_1, 'b' AS string_2" result_frame = gbq.read_gbq(query, project_id=_get_project_id(), - index_col="STRING_1", + index_col="string_1", private_key=_get_private_key_path()) correct_frame = DataFrame( - {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1") + {'string_1': ['a'], 'string_2': ['b']}).set_index("string_1") tm.assert_equal(result_frame.index.name, correct_frame.index.name) def test_column_order(self): - query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" - col_order = ['STRING_3', 'STRING_1', 'STRING_2'] + query = "SELECT 'a' AS string_1, 'b' AS string_2, 'c' AS string_3" + col_order = ['string_3', 'string_1', 'string_2'] result_frame = gbq.read_gbq(query, project_id=_get_project_id(), col_order=col_order, private_key=_get_private_key_path()) - correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': [ - 'b'], 'STRING_3': ['c']})[col_order] + correct_frame = DataFrame({'string_1': ['a'], 'string_2': [ + 'b'], 'string_3': ['c']})[col_order] tm.assert_frame_equal(result_frame, correct_frame) def test_column_order_plus_index(self): - query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" - col_order = ['STRING_3', 'STRING_2'] + query = "SELECT 'a' AS string_1, 'b' AS string_2, 'c' AS string_3" + col_order = ['string_3', 'string_2'] result_frame = gbq.read_gbq(query, project_id=_get_project_id(), - index_col='STRING_1', col_order=col_order, + index_col='string_1', col_order=col_order, private_key=_get_private_key_path()) correct_frame = DataFrame( - {'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']}) - correct_frame.set_index('STRING_1', inplace=True) + {'string_1': ['a'], 'string_2': ['b'], 'string_3': ['c']}) + correct_frame.set_index('string_1', inplace=True) correct_frame = correct_frame[col_order] tm.assert_frame_equal(result_frame, correct_frame) @@ -655,14 +753,17 @@ def test_download_dataset_larger_than_200k_rows(self): def test_zero_rows(self): # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 - df = gbq.read_gbq("SELECT title, id " + df = gbq.read_gbq("SELECT title, id, is_bot, " + "SEC_TO_TIMESTAMP(timestamp) ts " "FROM [publicdata:samples.wikipedia] " "WHERE timestamp=-9999999", project_id=_get_project_id(), private_key=_get_private_key_path()) page_array = np.zeros( - (0,), dtype=[('title', object), ('id', np.dtype(float))]) - expected_result = DataFrame(page_array, columns=['title', 'id']) + (0,), dtype=[('title', object), ('id', np.dtype(int)), + ('is_bot', np.dtype(bool)), ('ts', 'M8[ns]')]) + expected_result = DataFrame( + page_array, 
columns=['title', 'id', 'is_bot', 'ts']) self.assert_frame_equal(df, expected_result) def test_legacy_sql(self): @@ -715,7 +816,7 @@ def test_invalid_option_for_sql_dialect(self): dialect='standard', private_key=_get_private_key_path()) def test_query_with_parameters(self): - sql_statement = "SELECT @param1 + @param2 as VALID_RESULT" + sql_statement = "SELECT @param1 + @param2 AS valid_result" config = { 'query': { "useLegacySql": False, @@ -753,11 +854,11 @@ def test_query_with_parameters(self): df = gbq.read_gbq(sql_statement, project_id=_get_project_id(), private_key=_get_private_key_path(), configuration=config) - tm.assert_frame_equal(df, DataFrame({'VALID_RESULT': [3]})) + tm.assert_frame_equal(df, DataFrame({'valid_result': [3]})) def test_query_inside_configuration(self): - query_no_use = 'SELECT "PI_WRONG" as VALID_STRING' - query = 'SELECT "PI" as VALID_STRING' + query_no_use = 'SELECT "PI_WRONG" AS valid_string' + query = 'SELECT "PI" AS valid_string' config = { 'query': { "query": query, @@ -774,7 +875,7 @@ def test_query_inside_configuration(self): df = gbq.read_gbq(None, project_id=_get_project_id(), private_key=_get_private_key_path(), configuration=config) - tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) def test_configuration_without_query(self): sql_statement = 'SELECT 1' @@ -800,7 +901,7 @@ def test_configuration_without_query(self): configuration=config) -class TestToGBQIntegration(tm.TestCase): +class TestToGBQIntegrationWithServiceAccountKeyPath(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. # Make sure to modify the for loop range in the tearDownClass when a new @@ -814,6 +915,7 @@ def setUpClass(cls): # executing *ALL* tests described below. _skip_if_no_project_id() + _skip_if_no_private_key_path() _setup_common() clean_gbq_environment(_get_private_key_path()) @@ -859,11 +961,11 @@ def test_upload_data(self): sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_path()) - self.assertEqual(result['NUM_ROWS'][0], test_size) + self.assertEqual(result['num_rows'][0], test_size) def test_upload_data_if_table_exists_fail(self): destination_table = DESTINATION_TABLE + "2" @@ -899,11 +1001,11 @@ def test_upload_data_if_table_exists_append(self): sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_path()) - self.assertEqual(result['NUM_ROWS'][0], test_size * 2) + self.assertEqual(result['num_rows'][0], test_size * 2) # Try inserting with a different schema, confirm failure with tm.assertRaises(gbq.InvalidSchema): @@ -932,11 +1034,11 @@ def test_upload_data_if_table_exists_replace(self): sleep(30) # <- Curses Google!!! 
- result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_path()) - self.assertEqual(result['NUM_ROWS'][0], 5) + self.assertEqual(result['num_rows'][0], 5) @tm.slow def test_google_upload_errors_should_raise_exception(self): @@ -1113,7 +1215,7 @@ def test_dataset_does_not_exist(self): DATASET_ID + "_not_found"), 'Expected dataset not to exist') -class TestToGBQIntegrationServiceAccountKeyPath(tm.TestCase): +class TestToGBQIntegrationWithLocalUserAccountAuth(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. # Make sure to modify the for loop range in the tearDownClass when a new @@ -1128,10 +1230,10 @@ def setUpClass(cls): # executing *ALL* tests described below. _skip_if_no_project_id() - _skip_if_no_private_key_path() + _skip_local_auth_if_in_travis_env() _setup_common() - clean_gbq_environment(_get_private_key_path()) + clean_gbq_environment() def setUp(self): # - PER-TEST FIXTURES - @@ -1145,7 +1247,7 @@ def tearDownClass(cls): # put here any instruction you want to execute only *ONCE* *AFTER* # executing all tests. - clean_gbq_environment(_get_private_key_path()) + clean_gbq_environment() def tearDown(self): # - PER-TEST FIXTURES - @@ -1153,26 +1255,24 @@ def tearDown(self): # is executed. pass - def test_upload_data_as_service_account_with_key_path(self): + def test_upload_data(self): destination_table = "{0}.{1}".format(DATASET_ID + "2", TABLE_ID + "1") test_size = 10 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, - private_key=_get_private_key_path()) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000) sleep(30) # <- Curses Google!!! result = gbq.read_gbq( - "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), - project_id=_get_project_id(), - private_key=_get_private_key_path()) + "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), + project_id=_get_project_id()) - self.assertEqual(result['NUM_ROWS'][0], test_size) + self.assertEqual(result['num_rows'][0], test_size) -class TestToGBQIntegrationServiceAccountKeyContents(tm.TestCase): +class TestToGBQIntegrationWithServiceAccountKeyContents(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. # Make sure to modify the for loop range in the tearDownClass when a new @@ -1212,7 +1312,7 @@ def tearDown(self): # is executed. pass - def test_upload_data_as_service_account_with_key_contents(self): + def test_upload_data(self): destination_table = "{0}.{1}".format(DATASET_ID + "3", TABLE_ID + "1") test_size = 10 @@ -1224,7 +1324,7 @@ def test_upload_data_as_service_account_with_key_contents(self): sleep(30) # <- Curses Google!!! result = gbq.read_gbq( - "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), + "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_contents()) - self.assertEqual(result['NUM_ROWS'][0], test_size) + self.assertEqual(result['num_rows'][0], test_size)
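As a minimal, self-contained sketch of the post-processing rule this patch
adds to ``read_gbq`` (the helper name and the hand-built frame standing in
for a query result are illustrative, not pandas API):

.. code-block:: python

   import pandas as pd

   def cast_non_null_columns(df, schema):
       # Mirror the new read_gbq behaviour: object columns are narrowed
       # to bool/int only when they contain no NULLs, so integers above
       # 2**53 are never forced through float64.
       type_map = {'BOOLEAN': bool, 'INTEGER': int}
       for field in schema['fields']:
           name = field['name']
           if field['type'] in type_map and df[name].notnull().all():
               df[name] = df[name].astype(type_map[field['type']])
       return df

   frame = pd.DataFrame({'a': [1, 1 << 62], 'b': [True, None]},
                        dtype=object)
   schema = {'fields': [{'name': 'a', 'type': 'INTEGER'},
                        {'name': 'b', 'type': 'BOOLEAN'}]}
   cast_non_null_columns(frame, schema).dtypes  # a -> int64, b -> object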