Skip to content

Commit

Permalink
Fix the edge case when handling non numeric values of double type in …
Browse files Browse the repository at this point in the history
…delta stats

When reading the Delta snapshot and loading the information into the Delta AddFile object, non-numeric values of float or double type (for example, "NaN" or "-Infinity") from the column stats are represented as strings.
These special values need special handling; for reference, see how Delta handles them: https://github.com/delta-io/delta/blob/master/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/data/DefaultJsonRow.java#L210
  • Loading branch information
Hanzhi Wang authored and the-other-tim-brown committed Sep 18, 2024
1 parent a184a36 commit 486c407
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@
public class DeltaValueConverter {
private static final String DATE_FORMAT_STR = "yyyy-MM-dd HH:mm:ss";
private static final TimeZone TIME_ZONE = TimeZone.getTimeZone("UTC");
protected static final String NAN_VALUE = "NaN";
protected static final String INFINITY_VALUE = "Infinity";
protected static final String POSITIVE_INFINITY_VALUE = "+Infinity";
protected static final String POSITIVE_INF_VALUE = "+INF";
protected static final String NEGATIVE_INFINITY_VALUE = "-Infinity";
protected static final String NEGATIVE_INF_VALUE = "-INF";

static DateFormat getDateFormat(String dataFormatString) {
DateFormat dateFormat = new SimpleDateFormat(dataFormatString);
Expand Down Expand Up @@ -194,9 +200,35 @@ public static Object convertFromDeltaPartitionValue(

private static Object castObjectToInternalType(Object value, InternalType valueType) {
switch (valueType) {
case DOUBLE:
if (value instanceof String)
switch (value.toString()) {
case NAN_VALUE:
return Double.NaN;
case POSITIVE_INF_VALUE:
case POSITIVE_INFINITY_VALUE:
case INFINITY_VALUE:
return Double.POSITIVE_INFINITY;
case NEGATIVE_INF_VALUE:
case NEGATIVE_INFINITY_VALUE:
return Double.NEGATIVE_INFINITY;
}
break;
case FLOAT:
if (value instanceof Double) {
return ((Double) value).floatValue();
} else if (value instanceof String) {
switch (value.toString()) {
case NAN_VALUE:
return Float.NaN;
case POSITIVE_INF_VALUE:
case POSITIVE_INFINITY_VALUE:
case INFINITY_VALUE:
return Float.POSITIVE_INFINITY;
case NEGATIVE_INF_VALUE:
case NEGATIVE_INFINITY_VALUE:
return Float.NEGATIVE_INFINITY;
}
}
break;
case DECIMAL:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@ void parseWrongDateTime() throws ParseException {
assertThrows(ParseException.class, () -> strictDateFormat.parse(wrongDateTime));
}

/**
 * Checks that a non-numeric column-stat value supplied by {@code nonNumericValuesForColStats}
 * is converted by {@link DeltaValueConverter#convertFromDeltaColumnStatValue} into the expected
 * value.
 *
 * @param fieldValue the raw column-stat value handed to the converter
 * @param fieldSchema the schema describing the field's data type
 * @param expectedDeltaValue the value the converter is expected to return
 */
@ParameterizedTest
@MethodSource("nonNumericValuesForColStats")
void formattedDifferentNonNumericValuesFromDeltaColumnStat(
    Object fieldValue, InternalSchema fieldSchema, Object expectedDeltaValue) {
  Object internalRepresentation =
      DeltaValueConverter.convertFromDeltaColumnStatValue(fieldValue, fieldSchema);
  // JUnit's assertEquals takes (expected, actual); keeping that order makes failure
  // messages report the expected and actual values correctly.
  assertEquals(expectedDeltaValue, internalRepresentation);
}

private static Stream<Arguments> valuesWithSchemaProviderForColStats() {
return Stream.of(
Arguments.of(
Expand Down Expand Up @@ -187,4 +196,22 @@ private static Stream<Arguments> valuesWithSchemaProviderForPartitions() {
"yyyy",
"2022"));
}

/**
 * Supplies (input value, field schema, expected value) triples for the non-numeric column-stat
 * conversion test.
 *
 * <p>Covers the string spellings of NaN and of positive/negative infinity for both DOUBLE and
 * FLOAT schemas, including the "+INF" / "-INF" aliases, plus inputs that are already
 * {@code Double} instances.
 */
private static Stream<Arguments> nonNumericValuesForColStats() {
  InternalSchema doubleSchema =
      InternalSchema.builder().name("double").dataType(InternalType.DOUBLE).build();
  InternalSchema floatSchema =
      InternalSchema.builder().name("float").dataType(InternalType.FLOAT).build();
  return Stream.of(
      // String spellings against a DOUBLE schema.
      Arguments.of("NaN", doubleSchema, Double.NaN),
      Arguments.of("Infinity", doubleSchema, Double.POSITIVE_INFINITY),
      Arguments.of("-Infinity", doubleSchema, Double.NEGATIVE_INFINITY),
      Arguments.of("+Infinity", doubleSchema, Double.POSITIVE_INFINITY),
      Arguments.of("+INF", doubleSchema, Double.POSITIVE_INFINITY),
      Arguments.of("-INF", doubleSchema, Double.NEGATIVE_INFINITY),
      // String spellings against a FLOAT schema.
      Arguments.of("NaN", floatSchema, Float.NaN),
      Arguments.of("Infinity", floatSchema, Float.POSITIVE_INFINITY),
      Arguments.of("-Infinity", floatSchema, Float.NEGATIVE_INFINITY),
      Arguments.of("+Infinity", floatSchema, Float.POSITIVE_INFINITY),
      Arguments.of("+INF", floatSchema, Float.POSITIVE_INFINITY),
      Arguments.of("-INF", floatSchema, Float.NEGATIVE_INFINITY),
      // Values that are already Double instances.
      Arguments.of(Double.NaN, doubleSchema, Double.NaN),
      Arguments.of(Double.POSITIVE_INFINITY, doubleSchema, Double.POSITIVE_INFINITY));
}
}

0 comments on commit 486c407

Please sign in to comment.