Skip to content

Commit

Permalink
ORC-663: [C++] Support timestamp statistics with nanosecond (#543)
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

1. Store nanosecond into TimestampColumnStatistics in C++ writer.
2. Be aware of nanosecond in the TimestampColumnStatistics.
3. Add new functions to TimestampColumnStatistics to get the nanoseconds while keeping backward compatibility.
4. Fix PPD to utilize nanosecond or use a default value for timestamp columns.

### Why are the changes needed?

To be consistent with the Java side.

### How was this patch tested?

Several new unit tests have been added to TestPredicateLeaf.cc, TestColumnStatistics.cc, and TestSearchArgument.cc.
  • Loading branch information
wgtmac authored Oct 30, 2020
1 parent b36b091 commit 248e78a
Show file tree
Hide file tree
Showing 10 changed files with 373 additions and 27 deletions.
22 changes: 16 additions & 6 deletions c++/include/orc/Statistics.hh
Original file line number Diff line number Diff line change
Expand Up @@ -305,26 +305,26 @@ namespace orc {
virtual ~TimestampColumnStatistics();

/**
* Check whether column minimum.
* Check whether minimum timestamp exists.
* @return true if has minimum
*/
virtual bool hasMinimum() const = 0;

/**
* Check whether column maximum.
* Check whether maximum timestamp exists.
* @return true if has maximum
*/
virtual bool hasMaximum() const = 0;

/**
* Get the minimum value for the column.
* @return minimum value
* Get the millisecond of minimum timestamp in UTC.
* @return minimum value in millisecond
*/
virtual int64_t getMinimum() const = 0;

/**
* Get the maximum value for the column.
* @return maximum value
* Get the millisecond of maximum timestamp in UTC.
* @return maximum value in millisecond
*/
virtual int64_t getMaximum() const = 0;

Expand Down Expand Up @@ -352,7 +352,17 @@ namespace orc {
*/
virtual int64_t getUpperBound() const = 0;

/**
* Get the sub-millisecond part of the minimum timestamp, i.e. the last
* 6 digits of its nanosecond value. This refines getMinimum(), which
* reports the minimum in milliseconds only.
* @return last 6 digits of nanosecond of minimum timestamp.
*/
virtual int32_t getMinimumNanos() const = 0;

/**
* Get the sub-millisecond part of the maximum timestamp, i.e. the last
* 6 digits of its nanosecond value. This refines getMaximum(), which
* reports the maximum in milliseconds only.
* @return last 6 digits of nanosecond of maximum timestamp.
*/
virtual int32_t getMaximumNanos() const = 0;
};

class Statistics {
Expand Down
38 changes: 35 additions & 3 deletions c++/include/orc/sargs/Literal.hh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,33 @@ namespace orc {
*/
class Literal {
public:
/**
 * Timestamp value made of whole seconds plus a nanosecond component.
 *
 * Values are ordered lexicographically: first by `second`, then by `nanos`.
 * The defaulted constructor does not assign `second` or `nanos`.
 */
struct Timestamp {
  int64_t second;
  int32_t nanos;

  Timestamp() = default;
  Timestamp(int64_t second_, int32_t nanos_): second(second_), nanos(nanos_) {}
  Timestamp(const Timestamp&) = default;
  Timestamp(Timestamp&&) = default;
  Timestamp& operator=(const Timestamp&) = default;
  Timestamp& operator=(Timestamp&&) = default;
  ~Timestamp() = default;

  /** Equal only when both the second and the nanos fields match. */
  bool operator==(const Timestamp& r) const {
    return nanos == r.nanos && second == r.second;
  }

  bool operator!=(const Timestamp& r) const {
    return second != r.second || nanos != r.nanos;
  }

  /** Strictly earlier: seconds decide, nanos break ties. */
  bool operator<(const Timestamp& r) const {
    if (second != r.second) {
      return second < r.second;
    }
    return nanos < r.nanos;
  }

  /** Earlier or equal: seconds decide, nanos break ties. */
  bool operator<=(const Timestamp& r) const {
    if (second != r.second) {
      return second < r.second;
    }
    return nanos <= r.nanos;
  }

  bool operator>(const Timestamp& r) const { return !(*this <= r); }

  bool operator>=(const Timestamp& r) const { return !(*this < r); }

  /**
   * Convert to milliseconds: seconds scaled by 1000 plus the whole
   * milliseconds contained in `nanos` (integer division by 1000000).
   */
  int64_t getMillis() const {
    const int64_t fromSeconds = second * 1000;
    const int32_t fromNanos = nanos / 1000000;
    return fromSeconds + fromNanos;
  }
};

Literal(const Literal &r);
~Literal();
Literal& operator=(const Literal& r);
Expand Down Expand Up @@ -63,10 +90,15 @@ namespace orc {
Literal(bool val);

/**
* Create a literal of Timestamp or DATE type
* Create a literal of DATE type
*/
Literal(PredicateDataType type, int64_t val);

/**
* Create a literal of TIMESTAMP type
*/
Literal(int64_t second, int32_t nanos);

/**
* Create a literal of STRING type
*/
Expand All @@ -82,7 +114,7 @@ namespace orc {
*/
int64_t getLong() const;
int64_t getDate() const;
int64_t getTimestamp() const;
Timestamp getTimestamp() const;
double getFloat() const;
std::string getString() const;
bool getBool() const;
Expand All @@ -105,7 +137,7 @@ namespace orc {
double DoubleVal;
int64_t DateVal;
char * Buffer;
int64_t TimeStampVal;
Timestamp TimeStampVal;
Int128 DecimalVal;
bool BooleanVal;

Expand Down
2 changes: 1 addition & 1 deletion c++/src/ColumnWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1802,7 +1802,7 @@ namespace orc {
if (enableBloomFilter) {
bloomFilter->addLong(millsUTC);
}
tsStats->update(millsUTC);
tsStats->update(millsUTC, static_cast<int32_t>(nanos[i] % 1000000));

if (secs[i] < 0 && nanos[i] != 0) {
secs[i] += 1;
Expand Down
8 changes: 8 additions & 0 deletions c++/src/Statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,8 @@ namespace orc {
_stats.setMaximum(0);
_lowerBound = 0;
_upperBound = 0;
_minimumNanos = DEFAULT_MIN_NANOS;
_maximumNanos = DEFAULT_MAX_NANOS;
}else{
const proto::TimestampStatistics& stats = pb.timestampstatistics();
_stats.setHasMinimum(
Expand All @@ -327,6 +329,12 @@ namespace orc {
(stats.has_maximum() && (statContext.writerTimezone != nullptr)));
_hasLowerBound = stats.has_minimumutc() || stats.has_minimum();
_hasUpperBound = stats.has_maximumutc() || stats.has_maximum();
// to be consistent with java side, non-default minimumnanos and maximumnanos
// are added by one in their serialized form.
_minimumNanos = stats.has_minimumnanos() ?
stats.minimumnanos() - 1 : DEFAULT_MIN_NANOS;
_maximumNanos = stats.has_maximumnanos() ?
stats.maximumnanos() - 1 : DEFAULT_MAX_NANOS;

// Timestamp stats are stored in milliseconds
if (stats.has_minimumutc()) {
Expand Down
84 changes: 83 additions & 1 deletion c++/src/Statistics.hh
Original file line number Diff line number Diff line change
Expand Up @@ -1214,6 +1214,10 @@ namespace orc {
bool _hasUpperBound;
int64_t _lowerBound;
int64_t _upperBound;
int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp
int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp
static constexpr int32_t DEFAULT_MIN_NANOS = 0;
static constexpr int32_t DEFAULT_MAX_NANOS = 999999;

public:
TimestampColumnStatisticsImpl() { reset(); }
Expand Down Expand Up @@ -1279,14 +1283,68 @@ namespace orc {
_stats.updateMinMax(value);
}

/**
 * Fold one timestamp into the running min/max statistics.
 * @param milli timestamp value in milliseconds
 * @param nano nanosecond component tracked alongside the millisecond
 *             value; it only breaks ties between equal millisecond values
 */
void update(int64_t milli, int32_t nano) {
  // The very first value seeds both bounds.
  if (!_stats.hasMinimum()) {
    _stats.setHasMinimum(true);
    _stats.setHasMaximum(true);
    _stats.setMinimum(milli);
    _stats.setMaximum(milli);
    _minimumNanos = nano;
    _maximumNanos = nano;
    return;
  }

  // Lower bound: a strictly smaller millisecond always wins; on a tie the
  // smaller nanosecond part is kept.
  const auto currentMin = _stats.getMinimum();
  if (milli <= currentMin) {
    const bool replacesNanos = (milli < currentMin) || (nano < _minimumNanos);
    if (replacesNanos) {
      _minimumNanos = nano;
    }
    _stats.setMinimum(milli);
  }

  // Upper bound: mirror image of the lower-bound handling.
  const auto currentMax = _stats.getMaximum();
  if (milli >= currentMax) {
    const bool replacesNanos = (milli > currentMax) || (nano > _maximumNanos);
    if (replacesNanos) {
      _maximumNanos = nano;
    }
    _stats.setMaximum(milli);
  }
}

/**
 * Merge another timestamp statistics object into this one: null flag,
 * value count, and the min/max bounds together with their nanosecond parts.
 * @param other statistics to merge; the reference dynamic_cast throws
 *              std::bad_cast if it is not a TimestampColumnStatisticsImpl
 */
void merge(const MutableColumnStatistics& other) override {
const TimestampColumnStatisticsImpl& tsStats =
dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
// NOTE(review): this text comes from a flattened diff view, so the next
// line may be a line the commit removed. If _stats.merge() already
// accumulates the null flag and value count, the explicit accumulation
// below would apply them a second time. Confirm against the real file.
_stats.merge(tsStats._stats);

_stats.setHasNull(_stats.hasNull() || tsStats.hasNull());
_stats.setNumberOfValues(_stats.getNumberOfValues() + tsStats.getNumberOfValues());

if (tsStats.hasMinimum()) {
if (!_stats.hasMinimum()) {
// This side has no bounds yet: adopt the other side's bounds wholesale.
// NOTE(review): only hasMinimum() is checked before reading the maximum;
// presumably minimum and maximum are always set together - confirm.
_stats.setHasMinimum(true);
_stats.setHasMaximum(true);
_stats.setMinimum(tsStats.getMinimum());
_stats.setMaximum(tsStats.getMaximum());
_minimumNanos = tsStats.getMinimumNanos();
_maximumNanos = tsStats.getMaximumNanos();
} else {
// Upper bound: a strictly larger millisecond value always brings its
// nanos along; on equal milliseconds the larger nanos is kept.
if (tsStats.getMaximum() >= _stats.getMaximum()) {
if (tsStats.getMaximum() > _stats.getMaximum() ||
tsStats.getMaximumNanos() > _maximumNanos) {
_maximumNanos = tsStats.getMaximumNanos();
}
_stats.setMaximum(tsStats.getMaximum());
}
// Lower bound: a strictly smaller millisecond value always brings its
// nanos along; on equal milliseconds the smaller nanos is kept.
if (tsStats.getMinimum() <= _stats.getMinimum()) {
if (tsStats.getMinimum() < _stats.getMinimum() ||
tsStats.getMinimumNanos() < _minimumNanos) {
_minimumNanos = tsStats.getMinimumNanos();
}
_stats.setMinimum(tsStats.getMinimum());
}
}
}
}

/**
 * Return the statistics to their initial state: the nanosecond parts go
 * back to their defaults and the underlying statistics are reset.
 */
void reset() override {
  _maximumNanos = DEFAULT_MAX_NANOS;
  _minimumNanos = DEFAULT_MIN_NANOS;
  _stats.reset();
}

void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
Expand All @@ -1298,9 +1356,17 @@ namespace orc {
if (_stats.hasMinimum()) {
tsStats->set_minimumutc(_stats.getMinimum());
tsStats->set_maximumutc(_stats.getMaximum());
if (_minimumNanos != DEFAULT_MIN_NANOS) {
tsStats->set_minimumnanos(_minimumNanos + 1);
}
if (_maximumNanos != DEFAULT_MAX_NANOS) {
tsStats->set_maximumnanos(_maximumNanos + 1);
}
} else {
tsStats->clear_minimumutc();
tsStats->clear_maximumutc();
tsStats->clear_minimumnanos();
tsStats->clear_maximumnanos();
}
}

Expand Down Expand Up @@ -1379,6 +1445,22 @@ namespace orc {
throw ParseError("UpperBound is not defined.");
}
}

/**
 * Nanosecond part stored for the minimum timestamp.
 * @return the stored minimum nanos
 * @throws ParseError when no minimum is defined
 */
int32_t getMinimumNanos() const override {
  if (!hasMinimum()) {
    throw ParseError("Minimum is not defined.");
  }
  return _minimumNanos;
}

/**
 * Nanosecond part stored for the maximum timestamp.
 * @return the stored maximum nanos
 * @throws ParseError when no maximum is defined
 */
int32_t getMaximumNanos() const override {
  if (!hasMaximum()) {
    throw ParseError("Maximum is not defined.");
  }
  return _maximumNanos;
}
};

ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
Expand Down
27 changes: 22 additions & 5 deletions c++/src/sargs/Literal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ namespace orc {
}

Literal::Literal(PredicateDataType type, int64_t val) {
if (type != PredicateDataType::DATE && type != PredicateDataType::TIMESTAMP) {
throw std::invalid_argument("only DATE & TIMESTAMP are supported here!");
if (type != PredicateDataType::DATE) {
throw std::invalid_argument("only DATE is supported here!");
}
mType = type;
mValue.IntVal = val;
Expand Down Expand Up @@ -99,6 +99,17 @@ namespace orc {
mHashCode = hashCode();
}

/**
 * Create a non-null literal of TIMESTAMP type.
 * @param second seconds component of the timestamp
 * @param nanos nanosecond component of the timestamp
 */
Literal::Literal(int64_t second, int32_t nanos) {
  mIsNull = false;
  mType = PredicateDataType::TIMESTAMP;
  mSize = sizeof(Timestamp);
  mPrecision = 0;
  mScale = 0;
  mValue.TimeStampVal = Timestamp(second, nanos);
  // Computed last: the hash is derived from the type and value set above.
  mHashCode = hashCode();
}

Literal::Literal(const Literal& r): mType(r.mType)
, mSize(r.mSize)
, mIsNull(r.mIsNull)
Expand All @@ -112,6 +123,8 @@ namespace orc {
mPrecision = r.mPrecision;
mScale = r.mScale;
mValue = r.mValue;
} else if (mType == PredicateDataType::TIMESTAMP) {
mValue.TimeStampVal = r.mValue.TimeStampVal;
} else {
mValue = r.mValue;
mPrecision = 0;
Expand Down Expand Up @@ -141,6 +154,8 @@ namespace orc {
if (mType == PredicateDataType::STRING) {
mValue.Buffer = new char[r.mSize];
memcpy(mValue.Buffer, r.mValue.Buffer, r.mSize);
} else if (mType == PredicateDataType::TIMESTAMP) {
mValue.TimeStampVal = r.mValue.TimeStampVal;
} else {
mValue = r.mValue;
}
Expand All @@ -163,7 +178,8 @@ namespace orc {
sstream << mValue.DateVal;
break;
case PredicateDataType::TIMESTAMP:
sstream << mValue.TimeStampVal;
sstream << mValue.TimeStampVal.second << "."
<< mValue.TimeStampVal.nanos;
break;
case PredicateDataType::FLOAT:
sstream << mValue.DoubleVal;
Expand Down Expand Up @@ -192,7 +208,8 @@ namespace orc {
case PredicateDataType::DATE:
return std::hash<int64_t>{}(mValue.DateVal);
case PredicateDataType::TIMESTAMP:
return std::hash<int64_t>{}(mValue.TimeStampVal);
return std::hash<int64_t>{}(mValue.TimeStampVal.second) * 17 +
std::hash<int32_t>{}(mValue.TimeStampVal.nanos);
case PredicateDataType::FLOAT:
return std::hash<double>{}(mValue.DoubleVal);
case PredicateDataType::BOOLEAN:
Expand Down Expand Up @@ -267,7 +284,7 @@ namespace orc {
return mValue.DateVal;
}

int64_t Literal::getTimestamp() const {
Literal::Timestamp Literal::getTimestamp() const {
validate(mIsNull, mType, PredicateDataType::TIMESTAMP);
return mValue.TimeStampVal;
}
Expand Down
Loading

0 comments on commit 248e78a

Please sign in to comment.