Skip to content

Commit

Permalink
Add checks & testcase where min/max stats can be null, if the column …
Browse files Browse the repository at this point in the history
…is empty
  • Loading branch information
ayushdg committed Sep 14, 2021
1 parent f7758a1 commit 2e3fa15
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 20 deletions.
76 changes: 62 additions & 14 deletions python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,43 +41,90 @@ def _parse_column_statistics(cs, column_statistics_blob):
column_statistics["number_of_values"] = cs.numberOfValues
if cs.HasField("hasNull"):
column_statistics["has_null"] = cs.hasNull

if cs.HasField("intStatistics"):
column_statistics["minimum"] = cs.intStatistics.minimum
column_statistics["maximum"] = cs.intStatistics.maximum
column_statistics["minimum"] = (
cs.intStatistics.minimum
if cs.intStatistics.HasField("minimum")
else None
)
column_statistics["maximum"] = (
cs.intStatistics.maximum
if cs.intStatistics.HasField("maximum")
else None
)
column_statistics["sum"] = (
cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None
)

elif cs.HasField("doubleStatistics"):
column_statistics["minimum"] = cs.doubleStatistics.minimum
column_statistics["maximum"] = cs.doubleStatistics.maximum
column_statistics["minimum"] = (
cs.doubleStatistics.minimum
if cs.doubleStatistics.HasField("minimum")
else None
)
column_statistics["maximum"] = (
cs.doubleStatistics.maximum
if cs.doubleStatistics.HasField("maximum")
else None
)
column_statistics["sum"] = (
cs.doubleStatistics.sum
if cs.doubleStatistics.HasField("sum")
else None
)

elif cs.HasField("stringStatistics"):
column_statistics["minimum"] = cs.stringStatistics.minimum
column_statistics["maximum"] = cs.stringStatistics.maximum
column_statistics["minimum"] = (
cs.stringStatistics.minimum
if cs.stringStatistics.HasField("minimum")
else None
)
column_statistics["maximum"] = (
cs.stringStatistics.maximum
if cs.stringStatistics.HasField("maximum")
else None
)
column_statistics["sum"] = cs.stringStatistics.sum

elif cs.HasField("bucketStatistics"):
column_statistics["true_count"] = cs.bucketStatistics.count[0]
column_statistics["false_count"] = (
column_statistics["number_of_values"]
- column_statistics["true_count"]
)

elif cs.HasField("decimalStatistics"):
column_statistics["minimum"] = cs.decimalStatistics.minimum
column_statistics["maximum"] = cs.decimalStatistics.maximum
column_statistics["minimum"] = (
cs.decimalStatistics.minimum
if cs.decimalStatistics.HasField("minimum")
else None
)
column_statistics["maximum"] = (
cs.decimalStatistics.maximum
if cs.decimalStatistics.HasField("maximum")
else None
)
column_statistics["sum"] = cs.decimalStatistics.sum

elif cs.HasField("dateStatistics"):
column_statistics["minimum"] = datetime.datetime.fromtimestamp(
datetime.timedelta(cs.dateStatistics.minimum).total_seconds(),
datetime.timezone.utc,
column_statistics["minimum"] = (
datetime.datetime.fromtimestamp(
datetime.timedelta(cs.dateStatistics.minimum).total_seconds(),
datetime.timezone.utc,
)
if cs.dateStatistics.HasField("minimum")
else None
)
column_statistics["maximum"] = datetime.datetime.fromtimestamp(
datetime.timedelta(cs.dateStatistics.maximum).total_seconds(),
datetime.timezone.utc,
column_statistics["maximum"] = (
datetime.datetime.fromtimestamp(
datetime.timedelta(cs.dateStatistics.maximum).total_seconds(),
datetime.timezone.utc,
)
if cs.dateStatistics.HasField("maximum")
else None
)

elif cs.HasField("timestampStatistics"):
# Before ORC-135, the local timezone offset was included and they were
# stored as minimum and maximum. After ORC-135, the timestamp is
Expand All @@ -93,6 +140,7 @@ def _parse_column_statistics(cs, column_statistics_blob):
column_statistics["maximum"] = datetime.datetime.fromtimestamp(
cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc
)

elif cs.HasField("binaryStatistics"):
column_statistics["sum"] = cs.binaryStatistics.sum

Expand Down
62 changes: 56 additions & 6 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import random
from io import BytesIO
from string import ascii_lowercase

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -1217,10 +1218,59 @@ def test_statistics_sum_overflow():
writer.write((1, -1, 1))

file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
assert file_stats[0]["a"]["sum"] is None
assert file_stats[0]["b"]["sum"] is None
assert file_stats[0]["c"]["sum"] == minint64 + 1
assert file_stats[0]["a"].get("sum") is None
assert file_stats[0]["b"].get("sum") is None
assert file_stats[0]["c"].get("sum") == minint64 + 1

assert stripe_stats[0]["a"]["sum"] is None
assert stripe_stats[0]["b"]["sum"] is None
assert stripe_stats[0]["c"]["sum"] == minint64 + 1
assert stripe_stats[0]["a"].get("sum") is None
assert stripe_stats[0]["b"].get("sum") is None
assert stripe_stats[0]["c"].get("sum") == minint64 + 1


def test_empty_statistics():
    """Check the parsed ORC statistics of columns that hold only nulls.

    A single row is written in which every column except the last one is
    null. The statistics read back are then checked, for both the file
    and the stripe level, to report no minimum/maximum for the empty
    columns while still exposing the remaining fields.
    """
    orc_schema = po.Struct(
        a=po.BigInt(),
        b=po.Double(),
        c=po.String(),
        d=po.Decimal(11, 2),
        e=po.Date(),
        f=po.Timestamp(),
        g=po.Boolean(),
        h=po.Binary(),
        # The last column carries a non-null value; without at least one
        # such column the cudf/pyorc readers crash.
        i=po.BigInt(),
    )
    null_column_count = len(orc_schema.fields) - 1
    row = tuple([None] * null_column_count + [1])

    buff = BytesIO()
    with po.Writer(buff, orc_schema) as writer:
        writer.write(row)

    all_stats = cudf.io.orc.read_orc_statistics([buff])

    # The same expectations hold for the file stats and the stripe stats.
    for stats in all_stats:
        first_entry = stats[0]

        # Columns "a" through "f" share the same expected values.
        for col_name in ascii_lowercase[:6]:
            empty_col = first_entry[col_name]
            assert empty_col.get("number_of_values") == 0
            assert empty_col.get("has_null") is True
            assert empty_col.get("minimum") is None
            assert empty_col.get("maximum") is None

        for col_name in ascii_lowercase[:3]:
            assert first_entry[col_name].get("sum") == 0

        # The decimal column reports its sum as a string.
        assert first_entry["d"].get("sum") == "0"

        bool_col = first_entry["g"]
        assert bool_col.get("number_of_values") == 0
        assert bool_col.get("has_null") is True
        assert bool_col.get("true_count") == 0
        assert bool_col.get("false_count") == 0

        binary_col = first_entry["h"]
        assert binary_col.get("number_of_values") == 0
        assert binary_col.get("has_null") is True
        assert binary_col.get("sum") == 0

        # The only populated column: a single value of 1.
        filled_col = first_entry["i"]
        assert filled_col.get("number_of_values") == 1
        assert filled_col.get("has_null") is False
        assert filled_col.get("minimum") == 1
        assert filled_col.get("maximum") == 1
        assert filled_col.get("sum") == 1

0 comments on commit 2e3fa15

Please sign in to comment.