
Fix logic while parsing the sum statistic for numerical orc columns #9183

Merged: 4 commits, Sep 23, 2021
Changes from 3 commits
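
For orientation before the diff: the minimum/maximum/sum fields in the ORC protobuf column statistics are optional, and a writer can leave them out (the integer sum, for instance, is typically omitted when it would overflow int64, which is what the new overflow test below exercises). The old parser read these fields unconditionally, so an absent field silently came back as the protobuf default of 0. The change repeats one pattern across the affected branches; the snippet below simply condenses the before/after of the sum line from the diff:

# Before: an absent optional field reads as the protobuf default (0)
column_statistics["sum"] = cs.intStatistics.sum

# After: report None when the writer did not record the statistic
column_statistics["sum"] = (
    cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None
)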
86 changes: 70 additions & 16 deletions python/cudf/cudf/io/orc.py
@@ -41,37 +41,90 @@ def _parse_column_statistics(cs, column_statistics_blob):
         column_statistics["number_of_values"] = cs.numberOfValues
     if cs.HasField("hasNull"):
         column_statistics["has_null"] = cs.hasNull
+
     if cs.HasField("intStatistics"):
-        column_statistics["minimum"] = cs.intStatistics.minimum
-        column_statistics["maximum"] = cs.intStatistics.maximum
-        column_statistics["sum"] = cs.intStatistics.sum
+        column_statistics["minimum"] = (
+            cs.intStatistics.minimum
+            if cs.intStatistics.HasField("minimum")
+            else None
+        )
+        column_statistics["maximum"] = (
+            cs.intStatistics.maximum
+            if cs.intStatistics.HasField("maximum")
+            else None
+        )
+        column_statistics["sum"] = (
+            cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None
+        )
+
     elif cs.HasField("doubleStatistics"):
-        column_statistics["minimum"] = cs.doubleStatistics.minimum
-        column_statistics["maximum"] = cs.doubleStatistics.maximum
-        column_statistics["sum"] = cs.doubleStatistics.sum
+        column_statistics["minimum"] = (
+            cs.doubleStatistics.minimum
+            if cs.doubleStatistics.HasField("minimum")
+            else None
+        )
+        column_statistics["maximum"] = (
+            cs.doubleStatistics.maximum
+            if cs.doubleStatistics.HasField("maximum")
+            else None
+        )
+        column_statistics["sum"] = (
+            cs.doubleStatistics.sum
+            if cs.doubleStatistics.HasField("sum")
+            else None
+        )
+
     elif cs.HasField("stringStatistics"):
-        column_statistics["minimum"] = cs.stringStatistics.minimum
-        column_statistics["maximum"] = cs.stringStatistics.maximum
+        column_statistics["minimum"] = (
+            cs.stringStatistics.minimum
+            if cs.stringStatistics.HasField("minimum")
+            else None
+        )
+        column_statistics["maximum"] = (
+            cs.stringStatistics.maximum
+            if cs.stringStatistics.HasField("maximum")
+            else None
+        )
         column_statistics["sum"] = cs.stringStatistics.sum
+
     elif cs.HasField("bucketStatistics"):
         column_statistics["true_count"] = cs.bucketStatistics.count[0]
         column_statistics["false_count"] = (
             column_statistics["number_of_values"]
             - column_statistics["true_count"]
         )
+
     elif cs.HasField("decimalStatistics"):
-        column_statistics["minimum"] = cs.decimalStatistics.minimum
-        column_statistics["maximum"] = cs.decimalStatistics.maximum
+        column_statistics["minimum"] = (
+            cs.decimalStatistics.minimum
+            if cs.decimalStatistics.HasField("minimum")
+            else None
+        )
+        column_statistics["maximum"] = (
+            cs.decimalStatistics.maximum
+            if cs.decimalStatistics.HasField("maximum")
+            else None
+        )
         column_statistics["sum"] = cs.decimalStatistics.sum
+
     elif cs.HasField("dateStatistics"):
-        column_statistics["minimum"] = datetime.datetime.fromtimestamp(
-            datetime.timedelta(cs.dateStatistics.minimum).total_seconds(),
-            datetime.timezone.utc,
+        column_statistics["minimum"] = (
+            datetime.datetime.fromtimestamp(
+                datetime.timedelta(cs.dateStatistics.minimum).total_seconds(),
+                datetime.timezone.utc,
+            )
+            if cs.dateStatistics.HasField("minimum")
+            else None
         )
-        column_statistics["maximum"] = datetime.datetime.fromtimestamp(
-            datetime.timedelta(cs.dateStatistics.maximum).total_seconds(),
-            datetime.timezone.utc,
+        column_statistics["maximum"] = (
+            datetime.datetime.fromtimestamp(
+                datetime.timedelta(cs.dateStatistics.maximum).total_seconds(),
+                datetime.timezone.utc,
+            )
+            if cs.dateStatistics.HasField("maximum")
+            else None
         )
+
     elif cs.HasField("timestampStatistics"):
         # Before ORC-135, the local timezone offset was included and they were
         # stored as minimum and maximum. After ORC-135, the timestamp is
@@ -87,6 +140,7 @@ def _parse_column_statistics(cs, column_statistics_blob):
         column_statistics["maximum"] = datetime.datetime.fromtimestamp(
             cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc
         )
+
     elif cs.HasField("binaryStatistics"):
         column_statistics["sum"] = cs.binaryStatistics.sum

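Downstream code should now expect None for statistics the writer did not record, instead of a misleading 0. A small usage sketch follows; the path "example.orc" and column name "a" are placeholders, and the return shape of read_orc_statistics is assumed from the tests below:

import cudf

# read_orc_statistics returns (file_statistics, stripe_statistics), each a
# list of dicts keyed by column name, as used in the tests below.
file_stats, stripe_stats = cudf.io.orc.read_orc_statistics(["example.orc"])

col = file_stats[0]["a"]
if col.get("sum") is not None:
    print("sum of column 'a':", col["sum"])
else:
    print("no sum recorded for column 'a' in the ORC footer")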
71 changes: 71 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
@@ -5,6 +5,7 @@
 import os
 import random
 from io import BytesIO
+from string import ascii_lowercase

 import numpy as np
 import pandas as pd
@@ -1203,3 +1204,73 @@ def test_names_in_struct_dtype_nesting(datadir):
     edf = cudf.DataFrame(expect.to_pandas())
     # test schema
     assert edf.dtypes.equals(got.dtypes)


+def test_statistics_sum_overflow():
+    maxint64 = np.iinfo(np.int64).max
+    minint64 = np.iinfo(np.int64).min
+
+    buff = BytesIO()
+    with po.Writer(
+        buff, po.Struct(a=po.BigInt(), b=po.BigInt(), c=po.BigInt())
+    ) as writer:
+        writer.write((maxint64, minint64, minint64))
+        writer.write((1, -1, 1))
+
+    file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
+    assert file_stats[0]["a"].get("sum") is None
+    assert file_stats[0]["b"].get("sum") is None
+    assert file_stats[0]["c"].get("sum") == minint64 + 1
+
+    assert stripe_stats[0]["a"].get("sum") is None
+    assert stripe_stats[0]["b"].get("sum") is None
+    assert stripe_stats[0]["c"].get("sum") == minint64 + 1
+
+
+def test_empty_statistics():
Review thread on this test:

Contributor:
Should there be one more test checking all the statistics values for a proper table containing columns of all types with values?

Member Author:
Agreed, right now the other read_orc_statistics test only checks for int and bool types. I'll add one/modify the existing test to include the different column types.

Member Author:
In the interest of time, I'll add one more test in a followup PR.

(A hypothetical sketch of such a test appears after the diff below.)

+    buff = BytesIO()
+    orc_schema = po.Struct(
+        a=po.BigInt(),
+        b=po.Double(),
+        c=po.String(),
+        d=po.Decimal(11, 2),
+        e=po.Date(),
+        f=po.Timestamp(),
+        g=po.Boolean(),
+        h=po.Binary(),
+        i=po.BigInt(),
+        # One column with non null value, else cudf/pyorc readers crash
+    )
+    data = tuple([None] * (len(orc_schema.fields) - 1) + [1])
+    with po.Writer(buff, orc_schema) as writer:
+        writer.write(data)
+
+    got = cudf.io.orc.read_orc_statistics([buff])
+
+    # Check for both file and stripe stats
+    for stats in got:
+        # Similar expected stats for the first 6 columns in this case
+        for col_name in ascii_lowercase[:6]:
+            assert stats[0][col_name].get("number_of_values") == 0
+            assert stats[0][col_name].get("has_null") is True
+            assert stats[0][col_name].get("minimum") is None
+            assert stats[0][col_name].get("maximum") is None
+        for col_name in ascii_lowercase[:3]:
+            assert stats[0][col_name].get("sum") == 0
+        # Sum for decimal column is a string
+        assert stats[0]["d"].get("sum") == "0"
+
+        assert stats[0]["g"].get("number_of_values") == 0
+        assert stats[0]["g"].get("has_null") is True
+        assert stats[0]["g"].get("true_count") == 0
+        assert stats[0]["g"].get("false_count") == 0
+
+        assert stats[0]["h"].get("number_of_values") == 0
+        assert stats[0]["h"].get("has_null") is True
+        assert stats[0]["h"].get("sum") == 0
+
+        assert stats[0]["i"].get("number_of_values") == 1
+        assert stats[0]["i"].get("has_null") is False
+        assert stats[0]["i"].get("minimum") == 1
+        assert stats[0]["i"].get("maximum") == 1
+        assert stats[0]["i"].get("sum") == 1