Skip to content

Commit

Permalink
Skip Hive boolean column stats when false count is -1
Browse files Browse the repository at this point in the history
Impala's 'COMPUTE STATS' writes boolean column statistics with -1 as the
number of false values and 1 as the number of true values, regardless of
the actual data. Skip the true/false counts when the false count is -1.
  • Loading branch information
VicoWu authored and electrum committed Apr 29, 2019
1 parent a1698e5 commit c7cc72d
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -518,9 +518,16 @@ public static HiveColumnStatistics fromMetastoreApiColumnStatistics(ColumnStatis
}
if (columnStatistics.getStatsData().isSetBooleanStats()) {
BooleanColumnStatsData booleanStatsData = columnStatistics.getStatsData().getBooleanStats();
OptionalLong trueCount = OptionalLong.empty();
OptionalLong falseCount = OptionalLong.empty();
// Impala 'COMPUTE STATS' writes 1 as the numTrue and -1 as the numFalse
if (booleanStatsData.isSetNumTrues() && booleanStatsData.isSetNumFalses() && (booleanStatsData.getNumFalses() != -1)) {
trueCount = OptionalLong.of(booleanStatsData.getNumTrues());
falseCount = OptionalLong.of(booleanStatsData.getNumFalses());
}
return createBooleanColumnStatistics(
booleanStatsData.isSetNumTrues() ? OptionalLong.of(booleanStatsData.getNumTrues()) : OptionalLong.empty(),
booleanStatsData.isSetNumFalses() ? OptionalLong.of(booleanStatsData.getNumFalses()) : OptionalLong.empty(),
trueCount,
falseCount,
booleanStatsData.isSetNumNulls() ? fromMetastoreNullsCount(booleanStatsData.getNumNulls()) : OptionalLong.empty());
}
if (columnStatistics.getStatsData().isSetStringStats()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,24 @@ public void testBooleanStatsToColumnStatistics()
assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty());
}

/**
 * Checks the conversion of boolean column statistics shaped like the ones
 * Impala writes (true count 1, false count -1): the resulting boolean
 * statistics must carry no true/false counts, while the nulls count is
 * still reported.
 */
@Test
public void testImpalaGeneratedBooleanStatistics()
{
    // Metastore stats object built from the values (1, -1, 2); the assertions
    // below expect the last value to surface as the nulls count.
    ColumnStatisticsObj impalaStats = new ColumnStatisticsObj(
            "my_col",
            BOOLEAN_TYPE_NAME,
            booleanStats(new BooleanColumnStatsData(1L, -1L, 2L)));
    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(impalaStats, OptionalLong.empty());

    // The true/false counts are dropped, but a BooleanStatistics object is still present.
    assertEquals(converted.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty())));
    assertEquals(converted.getNullsCount(), OptionalLong.of(2));

    // No statistics of any other type are produced for a boolean column.
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());

    // Size and distinct-value figures are absent as well.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());
}

@Test
public void testEmptyBooleanStatsToColumnStatistics()
{
Expand Down

0 comments on commit c7cc72d

Please sign in to comment.