Skip to content

Commit

Permalink
Use Statistics.Builder instead of mutating Statistics in Parquet MetadataReader
Browse files Browse the repository at this point in the history
  • Loading branch information
takezoe authored and raunaqmorarka committed May 30, 2024
1 parent 97b054e commit bf18de4
Showing 1 changed file with 9 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ public static org.apache.parquet.column.statistics.Statistics<?> readStats(Optio
&& statistics.isSetMin() && statistics.isSetMax() // the min,max fields used for UTF8 before Parquet PARQUET-1025
&& columnStatistics.genericGetMin() == null && columnStatistics.genericGetMax() == null
&& !CorruptStatistics.shouldIgnoreStatistics(fileCreatedBy.orElse(null), type.getPrimitiveTypeName())) {
tryReadOldUtf8Stats(statistics, (BinaryStatistics) columnStatistics);
columnStatistics = tryReadOldUtf8Stats(statistics, (BinaryStatistics) columnStatistics);
}

return columnStatistics;
Expand All @@ -294,7 +294,7 @@ public Optional<Boolean> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation
.orElse(FALSE);
}

private static void tryReadOldUtf8Stats(Statistics statistics, BinaryStatistics columnStatistics)
private static org.apache.parquet.column.statistics.Statistics<?> tryReadOldUtf8Stats(Statistics statistics, BinaryStatistics columnStatistics)
{
byte[] min = statistics.getMin();
byte[] max = statistics.getMax();
Expand Down Expand Up @@ -324,18 +324,20 @@ private static void tryReadOldUtf8Stats(Statistics statistics, BinaryStatistics
}
if (maxGoodLength == 0) {
// We can return just min bound, but code downstream likely expects both are present or both are absent.
return;
return columnStatistics;
}

min = Arrays.copyOf(min, minGoodLength);
max = Arrays.copyOf(max, maxGoodLength);
max[maxGoodLength - 1]++;
}

columnStatistics.setMinMaxFromBytes(min, max);
if (!columnStatistics.isNumNullsSet() && statistics.isSetNull_count()) {
columnStatistics.setNumNulls(statistics.getNull_count());
}
return org.apache.parquet.column.statistics.Statistics
.getBuilderForReading(columnStatistics.type())
.withMin(min)
.withMax(max)
.withNumNulls(!columnStatistics.isNumNullsSet() && statistics.isSetNull_count() ? statistics.getNull_count() : columnStatistics.getNumNulls())
.build();
}

private static boolean isAscii(byte b)
Expand Down

0 comments on commit bf18de4

Please sign in to comment.