Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-2988: Supports disabling statistics for specific columns #2989

Merged
merged 5 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ public class ParquetProperties {
public static final double DEFAULT_BLOOM_FILTER_FPP = 0.01;
public static final boolean DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED = false;
public static final int DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER = 5;
public static final boolean DEFAULT_STATISTICS_ENABLED = true;

public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;

Expand Down Expand Up @@ -122,6 +123,7 @@ public static WriterVersion fromString(String name) {
private final boolean pageWriteChecksumEnabled;
private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
private final Map<String, String> extraMetaData;
private final ColumnProperty<Boolean> statistics;

private ParquetProperties(Builder builder) {
this.pageSizeThreshold = builder.pageSize;
Expand Down Expand Up @@ -149,6 +151,7 @@ private ParquetProperties(Builder builder) {
this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();
this.extraMetaData = builder.extraMetaData;
this.statistics = builder.statistics.build();
}

public static Builder builder() {
Expand Down Expand Up @@ -330,6 +333,10 @@ public Map<String, String> getExtraMetaData() {
return extraMetaData;
}

public boolean getStatisticsEnabled(ColumnDescriptor column) {
return statistics.getValue(column);
}

@Override
public String toString() {
return "Parquet page size to " + getPageSizeThreshold() + '\n'
Expand Down Expand Up @@ -372,6 +379,7 @@ public static class Builder {
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
private Map<String, String> extraMetaData = new HashMap<>();
private final ColumnProperty.Builder<Boolean> statistics;

private Builder() {
enableDict = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_IS_DICTIONARY_ENABLED);
Expand All @@ -387,6 +395,7 @@ private Builder() {
ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED);
numBloomFilterCandidates =
ColumnProperty.<Integer>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER);
statistics = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_STATISTICS_ENABLED);
}

private Builder(ParquetProperties toCopy) {
Expand All @@ -409,6 +418,7 @@ private Builder(ParquetProperties toCopy) {
this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes;
this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled);
this.extraMetaData = toCopy.extraMetaData;
this.statistics = ColumnProperty.builder(toCopy.statistics);
}

/**
Expand Down Expand Up @@ -657,6 +667,18 @@ public Builder withExtraMetaData(Map<String, String> extraMetaData) {
return this;
}

/**
* Enable or disable the statistics for given column. All column statistics are enabled by default.
*
* @param columnPath the given column
* @param enabled enable or disable
* @return this builder for method chaining
*/
public Builder withStatisticsEnabled(String columnPath, boolean enabled) {
this.statistics.withValue(columnPath, enabled);
return this;
}

public ParquetProperties build() {
ParquetProperties properties = new ParquetProperties(this);
// we pass a constructed but uninitialized factory to ParquetProperties above as currently
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,23 @@
class ColumnValueCollector {

private final ColumnDescriptor path;
private final boolean statisticsEnabled;
private BloomFilterWriter bloomFilterWriter;
private BloomFilter bloomFilter;
private Statistics<?> statistics;
private SizeStatistics.Builder sizeStatisticsBuilder;

ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
this.path = path;
this.statisticsEnabled = props.getStatisticsEnabled(path);
resetPageStatistics();
initBloomFilter(bloomFilterWriter, props);
}

void resetPageStatistics() {
this.statistics = Statistics.createStats(path.getPrimitiveType());
this.statistics = statisticsEnabled
? Statistics.createStats(path.getPrimitiveType())
: Statistics.noopStats(path.getPrimitiveType());
this.sizeStatisticsBuilder = SizeStatistics.newBuilder(
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.column.statistics;

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType;

/**
* A noop statistics which always return empty.
*/
class NoopStatistics<T extends Comparable<T>> extends Statistics<T> {

NoopStatistics(PrimitiveType type) {
super(type);
}

@Override
public void updateStats(int value) {}

@Override
public void updateStats(long value) {}

@Override
public void updateStats(float value) {}

@Override
public void updateStats(double value) {}

@Override
public void updateStats(boolean value) {}

@Override
public void updateStats(Binary value) {}

@Override
public boolean equals(Object other) {
if (other == this) return true;
if (!(other instanceof Statistics)) return false;
Statistics stats = (Statistics) other;
return type().equals(stats.type());
}

@Override
public int hashCode() {
return 31 * type().hashCode();
}

@Override
protected void mergeStatisticsMinMax(Statistics stats) {}

@Override
public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes) {}

@Override
public T genericGetMin() {
throw new UnsupportedOperationException(
"genericGetMin is not supported by " + getClass().getName());
}

@Override
public T genericGetMax() {
throw new UnsupportedOperationException(
"genericGetMax is not supported by " + getClass().getName());
}

@Override
public byte[] getMaxBytes() {
throw new UnsupportedOperationException(
"getMaxBytes is not supported by " + getClass().getName());
}

@Override
public byte[] getMinBytes() {
throw new UnsupportedOperationException(
"getMinBytes is not supported by " + getClass().getName());
}

@Override
String stringify(T value) {
throw new UnsupportedOperationException(
"stringify is not supported by " + getClass().getName());
}

@Override
public boolean isSmallerThan(long size) {
throw new UnsupportedOperationException(
"isSmallerThan is not supported by " + getClass().getName());
}

@Override
public long getNumNulls() {
return -1;
}

@Override
public boolean isEmpty() {
return true;
}

@Override
public boolean hasNonNullValue() {
return false;
}

@Override
public boolean isNumNullsSet() {
return false;
}

@Override
public Statistics<T> copy() {
return new NoopStatistics<>(this.type());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,15 @@ public static Statistics<?> createStats(Type type) {
}
}

/**
* Creates a noop {@code NoopStatistics} statistics instance. This is only used when the user disables statistics for the specified column.
* @param type type of the column
* @return a noop statistics
*/
public static Statistics<?> noopStats(Type type) {
return new NoopStatistics<>(type.asPrimitiveType());
}

/**
* Returns a builder to create new statistics object. Used to read the statistics from the parquet file.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

import java.nio.ByteBuffer;
Expand Down Expand Up @@ -900,4 +902,29 @@ public void testSpecBuilderForDouble() {
assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
}

@Test
public void testNoopStatistics() {
// Test basic max/min
integerArray = new int[] {1, 3, 14, 54, 66, 8, 0, 23, 54};
Statistics<?> stats = Statistics.noopStats(new PrimitiveType(REQUIRED, INT32, "int32"));
assertTrue(stats.isEmpty());

for (int i : integerArray) {
stats.updateStats(i);
}

assertEquals(stats.getNumNulls(), -1);
assertFalse(stats.hasNonNullValue());
assertFalse(stats.isNumNullsSet());
assertTrue(stats.isEmpty());

assertThrows(UnsupportedOperationException.class, stats::genericGetMax);
assertThrows(UnsupportedOperationException.class, stats::genericGetMin);
assertThrows(UnsupportedOperationException.class, stats::getMaxBytes);
assertThrows(UnsupportedOperationException.class, stats::getMinBytes);
assertThrows(UnsupportedOperationException.class, stats::maxAsString);
assertThrows(UnsupportedOperationException.class, stats::minAsString);
assertThrows(UnsupportedOperationException.class, () -> stats.isSmallerThan(0));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,18 @@ public SELF config(String property, String value) {
return self();
}

/**
* Sets the statistics enabled/disabled for the specified column. All column statistics are enabled by default.
*
* @param columnPath the path of the column (dot-string)
* @param enabled whether to write calculate statistics for the column
* @return this builder for method chaining
*/
public SELF withStatisticsEnabled(String columnPath, boolean enabled) {
encodingPropsBuilder.withStatisticsEnabled(columnPath, enabled);
return self();
}

/**
* Build a {@link ParquetWriter} with the accumulated configuration.
*
Expand Down
Loading