From 77ae6cbf6882e9d94964baa12c79f6edf04ab9c7 Mon Sep 17 00:00:00 2001
From: Ayush Dattagupta
Date: Tue, 7 Sep 2021 02:18:17 -0700
Subject: [PATCH 1/3] Update read_orc_statistics to read sum only if it exists
 (defaults to 0 without the check)

---
 python/cudf/cudf/io/orc.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 8f6002bb577..feac4dd4985 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -44,11 +44,17 @@ def _parse_column_statistics(cs, column_statistics_blob):
     if cs.HasField("intStatistics"):
         column_statistics["minimum"] = cs.intStatistics.minimum
         column_statistics["maximum"] = cs.intStatistics.maximum
-        column_statistics["sum"] = cs.intStatistics.sum
+        column_statistics["sum"] = (
+            cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None
+        )
     elif cs.HasField("doubleStatistics"):
         column_statistics["minimum"] = cs.doubleStatistics.minimum
         column_statistics["maximum"] = cs.doubleStatistics.maximum
-        column_statistics["sum"] = cs.doubleStatistics.sum
+        column_statistics["sum"] = (
+            cs.doubleStatistics.sum
+            if cs.doubleStatistics.HasField("sum")
+            else None
+        )
     elif cs.HasField("stringStatistics"):
         column_statistics["minimum"] = cs.stringStatistics.minimum
         column_statistics["maximum"] = cs.stringStatistics.maximum

From f7758a1297425c8a35638ab6ad28b9b8f8203223 Mon Sep 17 00:00:00 2001
From: Ayush Dattagupta
Date: Tue, 7 Sep 2021 02:24:26 -0700
Subject: [PATCH 2/3] Add test for reading ORC data with empty sum statistics

---
 python/cudf/cudf/tests/test_orc.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index efa4359c68e..866d6c3617a 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1203,3 +1203,24 @@ def test_names_in_struct_dtype_nesting(datadir):
     edf = cudf.DataFrame(expect.to_pandas())
     # test schema
     assert edf.dtypes.equals(got.dtypes)
+
+
+def test_statistics_sum_overflow():
+    maxint64 = np.iinfo(np.int64).max
+    minint64 = np.iinfo(np.int64).min
+
+    buff = BytesIO()
+    with po.Writer(
+        buff, po.Struct(a=po.BigInt(), b=po.BigInt(), c=po.BigInt())
+    ) as writer:
+        writer.write((maxint64, minint64, minint64))
+        writer.write((1, -1, 1))
+
+    file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
+    assert file_stats[0]["a"]["sum"] is None
+    assert file_stats[0]["b"]["sum"] is None
+    assert file_stats[0]["c"]["sum"] == minint64 + 1
+
+    assert stripe_stats[0]["a"]["sum"] is None
+    assert stripe_stats[0]["b"]["sum"] is None
+    assert stripe_stats[0]["c"]["sum"] == minint64 + 1

From 2e3fa15f7bc6e7105d18a7f602302b7fc9994855 Mon Sep 17 00:00:00 2001
From: Ayush Dattagupta
Date: Tue, 14 Sep 2021 05:40:12 -0700
Subject: [PATCH 3/3] Add checks & test case for min/max stats that can be
 null if the column is empty

---
 python/cudf/cudf/io/orc.py         | 76 ++++++++++++++++++++++------
 python/cudf/cudf/tests/test_orc.py | 62 +++++++++++++++++++++---
 2 files changed, 118 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index feac4dd4985..3d1bbd7e71f 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -41,43 +41,90 @@ def _parse_column_statistics(cs, column_statistics_blob):
         column_statistics["number_of_values"] = cs.numberOfValues
     if cs.HasField("hasNull"):
         column_statistics["has_null"] = cs.hasNull
+
     if cs.HasField("intStatistics"):
-        column_statistics["minimum"] = cs.intStatistics.minimum
-        column_statistics["maximum"] = cs.intStatistics.maximum
+        column_statistics["minimum"] = (
+            cs.intStatistics.minimum
+            if cs.intStatistics.HasField("minimum")
+            else None
+        )
+        column_statistics["maximum"] = (
+            cs.intStatistics.maximum
+            if cs.intStatistics.HasField("maximum")
+            else None
+        )
         column_statistics["sum"] = (
             cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None
         )
+
     elif cs.HasField("doubleStatistics"):
-        column_statistics["minimum"] = cs.doubleStatistics.minimum
-        column_statistics["maximum"] = cs.doubleStatistics.maximum
+        column_statistics["minimum"] = (
+            cs.doubleStatistics.minimum
+            if cs.doubleStatistics.HasField("minimum")
+            else None
+        )
+        column_statistics["maximum"] = (
+            cs.doubleStatistics.maximum
+            if cs.doubleStatistics.HasField("maximum")
+            else None
+        )
         column_statistics["sum"] = (
             cs.doubleStatistics.sum
             if cs.doubleStatistics.HasField("sum")
             else None
         )
+
     elif cs.HasField("stringStatistics"):
-        column_statistics["minimum"] = cs.stringStatistics.minimum
-        column_statistics["maximum"] = cs.stringStatistics.maximum
+        column_statistics["minimum"] = (
+            cs.stringStatistics.minimum
+            if cs.stringStatistics.HasField("minimum")
+            else None
+        )
+        column_statistics["maximum"] = (
+            cs.stringStatistics.maximum
+            if cs.stringStatistics.HasField("maximum")
+            else None
+        )
         column_statistics["sum"] = cs.stringStatistics.sum
+
     elif cs.HasField("bucketStatistics"):
         column_statistics["true_count"] = cs.bucketStatistics.count[0]
         column_statistics["false_count"] = (
             column_statistics["number_of_values"]
             - column_statistics["true_count"]
         )
+
     elif cs.HasField("decimalStatistics"):
-        column_statistics["minimum"] = cs.decimalStatistics.minimum
-        column_statistics["maximum"] = cs.decimalStatistics.maximum
+        column_statistics["minimum"] = (
+            cs.decimalStatistics.minimum
+            if cs.decimalStatistics.HasField("minimum")
+            else None
+        )
+        column_statistics["maximum"] = (
+            cs.decimalStatistics.maximum
+            if cs.decimalStatistics.HasField("maximum")
+            else None
+        )
         column_statistics["sum"] = cs.decimalStatistics.sum
+
     elif cs.HasField("dateStatistics"):
-        column_statistics["minimum"] = datetime.datetime.fromtimestamp(
-            datetime.timedelta(cs.dateStatistics.minimum).total_seconds(),
-            datetime.timezone.utc,
+        column_statistics["minimum"] = (
+            datetime.datetime.fromtimestamp(
+                datetime.timedelta(cs.dateStatistics.minimum).total_seconds(),
+                datetime.timezone.utc,
+            )
+            if cs.dateStatistics.HasField("minimum")
+            else None
         )
-        column_statistics["maximum"] = datetime.datetime.fromtimestamp(
-            datetime.timedelta(cs.dateStatistics.maximum).total_seconds(),
-            datetime.timezone.utc,
+        column_statistics["maximum"] = (
+            datetime.datetime.fromtimestamp(
+                datetime.timedelta(cs.dateStatistics.maximum).total_seconds(),
+                datetime.timezone.utc,
+            )
+            if cs.dateStatistics.HasField("maximum")
+            else None
         )
+
     elif cs.HasField("timestampStatistics"):
         # Before ORC-135, the local timezone offset was included and they were
         # stored as minimum and maximum. After ORC-135, the timestamp is
@@ -93,6 +140,7 @@ def _parse_column_statistics(cs, column_statistics_blob):
         column_statistics["maximum"] = datetime.datetime.fromtimestamp(
             cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc
         )
+
     elif cs.HasField("binaryStatistics"):
         column_statistics["sum"] = cs.binaryStatistics.sum
 
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 866d6c3617a..eb9b4bfd99b 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -5,6 +5,7 @@
 import os
 import random
 from io import BytesIO
+from string import ascii_lowercase
 
 import numpy as np
 import pandas as pd
@@ -1217,10 +1218,59 @@ def test_statistics_sum_overflow():
         writer.write((1, -1, 1))
 
     file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
-    assert file_stats[0]["a"]["sum"] is None
-    assert file_stats[0]["b"]["sum"] is None
-    assert file_stats[0]["c"]["sum"] == minint64 + 1
+    assert file_stats[0]["a"].get("sum") is None
+    assert file_stats[0]["b"].get("sum") is None
+    assert file_stats[0]["c"].get("sum") == minint64 + 1
 
-    assert stripe_stats[0]["a"]["sum"] is None
-    assert stripe_stats[0]["b"]["sum"] is None
-    assert stripe_stats[0]["c"]["sum"] == minint64 + 1
+    assert stripe_stats[0]["a"].get("sum") is None
+    assert stripe_stats[0]["b"].get("sum") is None
+    assert stripe_stats[0]["c"].get("sum") == minint64 + 1
+
+
+def test_empty_statistics():
+    buff = BytesIO()
+    orc_schema = po.Struct(
+        a=po.BigInt(),
+        b=po.Double(),
+        c=po.String(),
+        d=po.Decimal(11, 2),
+        e=po.Date(),
+        f=po.Timestamp(),
+        g=po.Boolean(),
+        h=po.Binary(),
+        i=po.BigInt(),
+        # One column with a non-null value, else cudf/pyorc readers crash
+    )
+    data = tuple([None] * (len(orc_schema.fields) - 1) + [1])
+    with po.Writer(buff, orc_schema) as writer:
+        writer.write(data)
+
+    got = cudf.io.orc.read_orc_statistics([buff])
+
+    # Check for both file and stripe stats
+    for stats in got:
+        # Similar expected stats for the first 6 columns in this case
+        for col_name in ascii_lowercase[:6]:
+            assert stats[0][col_name].get("number_of_values") == 0
+            assert stats[0][col_name].get("has_null") is True
+            assert stats[0][col_name].get("minimum") is None
+            assert stats[0][col_name].get("maximum") is None
+        for col_name in ascii_lowercase[:3]:
+            assert stats[0][col_name].get("sum") == 0
+        # Sum for the decimal column is a string
+        assert stats[0]["d"].get("sum") == "0"
+
+        assert stats[0]["g"].get("number_of_values") == 0
+        assert stats[0]["g"].get("has_null") is True
+        assert stats[0]["g"].get("true_count") == 0
+        assert stats[0]["g"].get("false_count") == 0
+
+        assert stats[0]["h"].get("number_of_values") == 0
+        assert stats[0]["h"].get("has_null") is True
+        assert stats[0]["h"].get("sum") == 0
+
+        assert stats[0]["i"].get("number_of_values") == 1
+        assert stats[0]["i"].get("has_null") is False
+        assert stats[0]["i"].get("minimum") == 1
+        assert stats[0]["i"].get("maximum") == 1
+        assert stats[0]["i"].get("sum") == 1
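
Taken together, these patches change what a caller of
cudf.io.orc.read_orc_statistics can rely on: the "minimum", "maximum", and
"sum" entries of a column's statistics dictionary may now be None when the
file does not actually record that statistic, instead of silently reading as
0. Below is a minimal caller-side sketch of that behaviour (illustrative
only: the single-column file, the column name "a", and the written values are
hypothetical, and it assumes pyorc and numpy are installed and imported as po
and np, as in the tests above).

# Illustrative sketch only: the file, the column name "a", and the values
# written below are made up for the example.
from io import BytesIO

import numpy as np
import pyorc as po

import cudf

buff = BytesIO()
with po.Writer(buff, po.Struct(a=po.BigInt())) as writer:
    writer.write((np.iinfo(np.int64).max,))
    writer.write((1,))  # pushes the running sum past the int64 maximum

file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
col_stats = file_stats[0]["a"]

# With these patches applied, "sum" can be None (for example when the writer
# drops it on overflow) rather than silently reading as 0, so guard for it.
total = col_stats.get("sum")
if total is None:
    print("sum statistic not recorded for column 'a'")
else:
    print(f"sum of column 'a' = {total}")

Using .get("sum") here mirrors the updated tests, which tolerate the key
being missing as well as being None.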