Skip to content

Commit

Permalink
feat!: use nullable Int64 and boolean dtypes if available (#445)
Browse files Browse the repository at this point in the history
* feat: use nullable Int64 and boolean dtypes if available

* allow google-cloud-bigquery 3.x

* document dtypes mapping
  • Loading branch information
tswast authored Dec 9, 2021
1 parent e13abaf commit 89078f8
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 26 deletions.
8 changes: 5 additions & 3 deletions docs/reading.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,13 @@ column, based on the BigQuery table schema.
================== =========================
BigQuery Data Type dtype
================== =========================
FLOAT float
TIMESTAMP :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'``
DATE datetime64[ns]
DATETIME datetime64[ns]
BOOL boolean
FLOAT float
INT64 Int64
TIME datetime64[ns]
DATE datetime64[ns]
TIMESTAMP :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'``
================== =========================

.. _reading-bqstorage-api:
Expand Down
8 changes: 8 additions & 0 deletions pandas_gbq/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
BIGQUERY_BQSTORAGE_VERSION = "1.24.0"
BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0"
PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"


Expand Down Expand Up @@ -90,6 +91,13 @@ def pandas_has_deprecated_verbose(self):
)
return self.pandas_installed_version >= pandas_verbosity_deprecation

@property
def pandas_has_boolean_dtype(self):
    """Report whether the installed pandas meets the boolean-dtype version.

    Returns:
        bool: True when ``self.pandas_installed_version`` is at least
        ``PANDAS_BOOLEAN_DTYPE_VERSION``.
    """
    # Imported lazily inside the property, as the neighbouring
    # ``pandas_has_parquet_with_lossless_timestamp`` check also does.
    import pkg_resources

    desired_version = pkg_resources.parse_version(PANDAS_BOOLEAN_DTYPE_VERSION)
    return self.pandas_installed_version >= desired_version

@property
def pandas_has_parquet_with_lossless_timestamp(self):
import pkg_resources
Expand Down
7 changes: 6 additions & 1 deletion pandas_gbq/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,12 +579,13 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
#missing-data-casting-rules-and-indexing
"""
# If you update this mapping, also update the table at
# `docs/source/reading.rst`.
# `docs/reading.rst`.
dtype_map = {
"DATE": "datetime64[ns]",
"DATETIME": "datetime64[ns]",
"FLOAT": np.dtype(float),
"GEOMETRY": "object",
"INTEGER": "Int64",
"RECORD": "object",
"STRING": "object",
# datetime.time objects cannot be cast to datetime64.
Expand All @@ -596,6 +597,10 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
"TIMESTAMP": "datetime64[ns]",
}

# Amend dtype_map with newer extension types if pandas version allows.
if FEATURES.pandas_has_boolean_dtype:
dtype_map["BOOLEAN"] = "boolean"

dtypes = {}
for field in schema_fields:
name = str(field["name"])
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"google-auth-oauthlib",
# 2.4.* has a bug where waiting for the query can hang indefinitely.
# https://github.com/pydata/pandas-gbq/issues/343
"google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<3.0.0dev,!=2.4.*",
"google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<4.0.0dev,!=2.4.*",
]
extras = {
"tqdm": "tqdm>=4.23.0",
Expand Down
70 changes: 51 additions & 19 deletions tests/system/test_gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np
import pandas
import pandas.api.types
import pandas.util.testing as tm
import pandas.testing as tm
from pandas import DataFrame, NaT

try:
Expand All @@ -21,6 +21,7 @@
import pytz

from pandas_gbq import gbq
from pandas_gbq.features import FEATURES
import pandas_gbq.schema


Expand All @@ -32,6 +33,18 @@ def test_imports():
gbq._test_google_api_imports()


def make_mixed_dataframe_v1():
    """Build a small DataFrame whose columns have mixed types.

    Re-implementation of the private
    ``pandas.util.testing.makeMixedDataFrame`` helper.

    Returns:
        pandas.DataFrame: Five rows with float columns ``A`` and ``B``,
        string column ``C``, and column ``D`` holding five business days
        generated from "1/1/2009".
    """
    return pandas.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pandas.bdate_range("1/1/2009", periods=5),
        }
    )


def make_mixed_dataframe_v2(test_size):
# create df to test for all BQ datatypes except RECORD
bools = np.random.randint(2, size=(1, test_size)).astype(bool)
Expand Down Expand Up @@ -168,7 +181,7 @@ def test_should_properly_handle_valid_integers(self, project_id):
credentials=self.credentials,
dialect="standard",
)
tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}))
tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}, dtype="Int64"))

def test_should_properly_handle_nullable_integers(self, project_id):
query = """SELECT * FROM
Expand All @@ -194,7 +207,7 @@ def test_should_properly_handle_valid_longs(self, project_id):
credentials=self.credentials,
dialect="standard",
)
tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}))
tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}, dtype="Int64"))

def test_should_properly_handle_nullable_longs(self, project_id):
query = """SELECT * FROM
Expand Down Expand Up @@ -433,7 +446,10 @@ def test_should_properly_handle_null_boolean(self, project_id):
credentials=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(df, DataFrame({"null_boolean": [None]}))
expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None
tm.assert_frame_equal(
df, DataFrame({"null_boolean": [None]}, dtype=expected_dtype)
)

def test_should_properly_handle_nullable_booleans(self, project_id):
query = """SELECT * FROM
Expand All @@ -445,8 +461,9 @@ def test_should_properly_handle_nullable_booleans(self, project_id):
credentials=self.credentials,
dialect="legacy",
)
expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None
tm.assert_frame_equal(
df, DataFrame({"nullable_boolean": [True, None]}).astype(object)
df, DataFrame({"nullable_boolean": [True, None]}, dtype=expected_dtype)
)

def test_unicode_string_conversion_and_normalization(self, project_id):
Expand Down Expand Up @@ -629,7 +646,7 @@ def test_one_row_one_column(self, project_id):
credentials=self.credentials,
dialect="standard",
)
expected_result = DataFrame(dict(v=[3]))
expected_result = DataFrame(dict(v=[3]), dtype="Int64")
tm.assert_frame_equal(df, expected_result)

def test_legacy_sql(self, project_id):
Expand Down Expand Up @@ -719,7 +736,7 @@ def test_query_with_parameters(self, project_id):
configuration=config,
dialect="legacy",
)
tm.assert_frame_equal(df, DataFrame({"valid_result": [3]}))
tm.assert_frame_equal(df, DataFrame({"valid_result": [3]}, dtype="Int64"))

def test_query_inside_configuration(self, project_id):
query_no_use = 'SELECT "PI_WRONG" AS valid_string'
Expand Down Expand Up @@ -842,7 +859,11 @@ def test_struct(self, project_id):
dialect="standard",
)
expected = DataFrame(
[[1, {"letter": "a", "num": 1}]], columns=["int_field", "struct_field"],
{
"int_field": pandas.Series([1], dtype="Int64"),
"struct_field": [{"letter": "a", "num": 1}],
},
columns=["int_field", "struct_field"],
)
tm.assert_frame_equal(df, expected)

Expand Down Expand Up @@ -874,7 +895,12 @@ def test_array_length_zero(self, project_id):
dialect="standard",
)
expected = DataFrame(
[["a", [""], 1], ["b", [], 0]], columns=["letter", "array_field", "len"],
{
"letter": ["a", "b"],
"array_field": [[""], []],
"len": pandas.Series([1, 0], dtype="Int64"),
},
columns=["letter", "array_field", "len"],
)
tm.assert_frame_equal(df, expected)

Expand Down Expand Up @@ -908,7 +934,13 @@ def test_array_of_floats(self, project_id):
credentials=self.credentials,
dialect="standard",
)
tm.assert_frame_equal(df, DataFrame([[[1.1, 2.2, 3.3], 4]], columns=["a", "b"]))
tm.assert_frame_equal(
df,
DataFrame(
{"a": [[1.1, 2.2, 3.3]], "b": pandas.Series([4], dtype="Int64")},
columns=["a", "b"],
),
)

def test_tokyo(self, tokyo_dataset, tokyo_table, project_id):
df = gbq.read_gbq(
Expand Down Expand Up @@ -1021,7 +1053,7 @@ def test_upload_data_if_table_exists_append(self, project_id):
test_id = "3"
test_size = 10
df = make_mixed_dataframe_v2(test_size)
df_different_schema = tm.makeMixedDataFrame()
df_different_schema = make_mixed_dataframe_v1()

# Initialize table with sample data
gbq.to_gbq(
Expand Down Expand Up @@ -1101,7 +1133,7 @@ def test_upload_data_if_table_exists_replace(self, project_id):
test_id = "4"
test_size = 10
df = make_mixed_dataframe_v2(test_size)
df_different_schema = tm.makeMixedDataFrame()
df_different_schema = make_mixed_dataframe_v1()

# Initialize table with sample data
gbq.to_gbq(
Expand Down Expand Up @@ -1225,7 +1257,7 @@ def test_upload_data_with_newlines(self, project_id):
result = result_df["s"].sort_values()
expected = df["s"].sort_values()

tm.assert_numpy_array_equal(expected.values, result.values)
tm.assert_series_equal(expected, result)

def test_upload_data_flexible_column_order(self, project_id):
test_id = "13"
Expand Down Expand Up @@ -1254,7 +1286,7 @@ def test_upload_data_flexible_column_order(self, project_id):
def test_upload_data_with_valid_user_schema(self, project_id):
# Issue #46; tests test scenarios with user-provided
# schemas
df = tm.makeMixedDataFrame()
df = make_mixed_dataframe_v1()
test_id = "18"
test_schema = [
{"name": "A", "type": "FLOAT"},
Expand All @@ -1276,7 +1308,7 @@ def test_upload_data_with_valid_user_schema(self, project_id):
)

def test_upload_data_with_invalid_user_schema_raises_error(self, project_id):
df = tm.makeMixedDataFrame()
df = make_mixed_dataframe_v1()
test_id = "19"
test_schema = [
{"name": "A", "type": "FLOAT"},
Expand All @@ -1295,7 +1327,7 @@ def test_upload_data_with_invalid_user_schema_raises_error(self, project_id):
)

def test_upload_data_with_missing_schema_fields_raises_error(self, project_id):
df = tm.makeMixedDataFrame()
df = make_mixed_dataframe_v1()
test_id = "20"
test_schema = [
{"name": "A", "type": "FLOAT"},
Expand Down Expand Up @@ -1351,7 +1383,7 @@ def test_upload_data_with_timestamp(self, project_id):
tm.assert_series_equal(expected, result)

def test_upload_data_with_different_df_and_user_schema(self, project_id):
df = tm.makeMixedDataFrame()
df = make_mixed_dataframe_v1()
df["A"] = df["A"].astype(str)
df["B"] = df["B"].astype(str)
test_id = "22"
Expand Down Expand Up @@ -1460,13 +1492,13 @@ def test_dataset_does_not_exist(gbq_dataset, random_dataset_id):


def test_create_table(gbq_table):
schema = gbq._generate_bq_schema(tm.makeMixedDataFrame())
schema = gbq._generate_bq_schema(make_mixed_dataframe_v1())
gbq_table.create("test_create_table", schema)
assert gbq_table.exists("test_create_table")


def test_create_table_already_exists(gbq_table):
schema = gbq._generate_bq_schema(tm.makeMixedDataFrame())
schema = gbq._generate_bq_schema(make_mixed_dataframe_v1())
gbq_table.create("test_create_table_exists", schema)
with pytest.raises(gbq.TableCreationError):
gbq_table.create("test_create_table_exists", schema)
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ def no_auth(monkeypatch):
@pytest.mark.parametrize(
("type_", "expected"),
[
("INTEGER", None), # Can't handle NULL
("BOOLEAN", None), # Can't handle NULL
("SOME_NEW_UNKNOWN_TYPE", None),
("INTEGER", "Int64"),
("FLOAT", numpy.dtype(float)),
# TIMESTAMP will be localized after DataFrame construction.
("TIMESTAMP", "datetime64[ns]"),
Expand Down

0 comments on commit 89078f8

Please sign in to comment.