Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix coercion gaps for Hive tables in ORC format #22325

Merged
merged 2 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@
import io.trino.plugin.hive.coercions.BooleanCoercer.OrcVarcharToBooleanCoercer;
import io.trino.plugin.hive.coercions.DateCoercer.DateToVarcharCoercer;
import io.trino.plugin.hive.coercions.DateCoercer.VarcharToDateCoercer;
import io.trino.plugin.hive.coercions.DoubleToFloatCoercer;
import io.trino.plugin.hive.coercions.FloatToDoubleCoercer;
import io.trino.plugin.hive.coercions.IntegerNumberToDoubleCoercer;
import io.trino.plugin.hive.coercions.IntegerNumberToVarcharCoercer;
import io.trino.plugin.hive.coercions.IntegerNumberUpscaleCoercer;
import io.trino.plugin.hive.coercions.TimestampCoercer.LongTimestampToDateCoercer;
import io.trino.plugin.hive.coercions.TimestampCoercer.LongTimestampToVarcharCoercer;
import io.trino.plugin.hive.coercions.TimestampCoercer.VarcharToLongTimestampCoercer;
Expand Down Expand Up @@ -56,16 +59,21 @@
import static io.trino.orc.metadata.OrcType.OrcTypeKind.STRING;
import static io.trino.orc.metadata.OrcType.OrcTypeKind.TIMESTAMP;
import static io.trino.orc.metadata.OrcType.OrcTypeKind.VARCHAR;
import static io.trino.plugin.hive.coercions.DecimalCoercers.createDecimalToDecimalCoercer;
import static io.trino.plugin.hive.coercions.DecimalCoercers.createDecimalToDoubleCoercer;
import static io.trino.plugin.hive.coercions.DecimalCoercers.createDecimalToInteger;
import static io.trino.plugin.hive.coercions.DecimalCoercers.createDecimalToRealCoercer;
import static io.trino.plugin.hive.coercions.DecimalCoercers.createDecimalToVarcharCoercer;
import static io.trino.plugin.hive.coercions.DecimalCoercers.createDoubleToDecimalCoercer;
import static io.trino.plugin.hive.coercions.DecimalCoercers.createIntegerNumberToDecimalCoercer;
import static io.trino.plugin.hive.coercions.DecimalCoercers.createRealToDecimalCoercer;
import static io.trino.plugin.hive.coercions.DoubleToVarcharCoercers.createDoubleToVarcharCoercer;
import static io.trino.plugin.hive.coercions.FloatToVarcharCoercers.createFloatToVarcharCoercer;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TimestampType.TIMESTAMP_NANOS;
import static io.trino.spi.type.TinyintType.TINYINT;
import static io.trino.spi.type.VarcharType.createUnboundedVarcharType;

public final class OrcTypeTranslator
{
Expand All @@ -74,103 +82,121 @@ private OrcTypeTranslator() {}
/**
 * Picks the coercer used to read a column stored with ORC type {@code fromOrcType} as the Trino type
 * {@code toTrinoType}. Dispatch is on the ORC type kind first, then on the requested Trino type.
 *
 * NOTE(review): this listing is a rendered diff whose +/- markers were lost. Lines of the older
 * if-chain implementation and of the newer switch-based implementation are interleaved below, so
 * braces do not balance and the text does not compile as shown. Confirm against the real file
 * before editing any code here.
 *
 * @param fromOrcType ORC type of the stored data; its kind (and, for DECIMAL, its precision and scale) selects the branch
 * @param toTrinoType Trino type the data should be read as
 * @return the matching coercer, or {@code Optional.empty()} when no branch handles the pair
 */
public static Optional<TypeCoercer<? extends Type, ? extends Type>> createCoercer(OrcType fromOrcType, Type toTrinoType)
{
OrcTypeKind fromOrcTypeKind = fromOrcType.getOrcTypeKind();
// NOTE(review): this TIMESTAMP if-chain appears to be the older form; an equivalent switch-based branch appears further down.
if (fromOrcTypeKind == TIMESTAMP) {
if (toTrinoType instanceof VarcharType varcharType) {
return Optional.of(new LongTimestampToVarcharCoercer(TIMESTAMP_NANOS, varcharType));
}
if (toTrinoType instanceof DateType toDateType) {
return Optional.of(new LongTimestampToDateCoercer(TIMESTAMP_NANOS, toDateType));
}
return Optional.empty();

if (fromOrcTypeKind == BOOLEAN) {
return switch (toTrinoType) {
case VarcharType varcharType -> Optional.of(new BooleanToVarcharCoercer(varcharType));
default -> Optional.empty();
};
}
if (fromOrcTypeKind == DATE && toTrinoType instanceof VarcharType varcharType) {
return Optional.of(new DateToVarcharCoercer(varcharType));

// BYTE (Trino TINYINT) source: widening to smallint/integer/bigint, plus double, decimal and varchar targets.
if (fromOrcTypeKind == BYTE) {
return switch (toTrinoType) {
case SmallintType smallintType -> Optional.of(new IntegerNumberUpscaleCoercer<>(TINYINT, smallintType));
case IntegerType integerType -> Optional.of(new IntegerNumberUpscaleCoercer<>(TINYINT, integerType));
case BigintType bigintType -> Optional.of(new IntegerNumberUpscaleCoercer<>(TINYINT, bigintType));
case DoubleType ignored -> Optional.of(new IntegerNumberToDoubleCoercer<>(TINYINT));
case DecimalType decimalType -> Optional.of(createIntegerNumberToDecimalCoercer(TINYINT, decimalType));
case VarcharType varcharType -> Optional.of(new IntegerNumberToVarcharCoercer<>(TINYINT, varcharType));
default -> Optional.empty();
};
}
// NOTE(review): this varchar if-chain appears to be the older form; the switch-based varchar branch is near the end of the method.
if (isVarcharType(fromOrcTypeKind)) {
if (toTrinoType instanceof BooleanType) {
return Optional.of(new OrcVarcharToBooleanCoercer(createUnboundedVarcharType()));
}
if (toTrinoType instanceof TimestampType timestampType) {
if (timestampType.isShort()) {
return Optional.of(new VarcharToShortTimestampCoercer(createUnboundedVarcharType(), timestampType));
}
return Optional.of(new VarcharToLongTimestampCoercer(createUnboundedVarcharType(), timestampType));
}
if (toTrinoType instanceof DateType toDateType) {
return Optional.of(new VarcharToDateCoercer(createUnboundedVarcharType(), toDateType));
}
if (toTrinoType instanceof RealType) {
return Optional.of(new VarcharToFloatCoercer(createUnboundedVarcharType(), true));
}
if (toTrinoType instanceof DoubleType) {
return Optional.of(new VarcharToDoubleCoercer(createUnboundedVarcharType(), true));
}
if (toTrinoType instanceof TinyintType tinyintType) {
return Optional.of(new OrcVarcharToIntegralNumericCoercer<>(createUnboundedVarcharType(), tinyintType));
}
if (toTrinoType instanceof SmallintType smallintType) {
return Optional.of(new OrcVarcharToIntegralNumericCoercer<>(createUnboundedVarcharType(), smallintType));
}
if (toTrinoType instanceof IntegerType integerType) {
return Optional.of(new OrcVarcharToIntegralNumericCoercer<>(createUnboundedVarcharType(), integerType));
}
if (toTrinoType instanceof BigintType bigintType) {
return Optional.of(new OrcVarcharToIntegralNumericCoercer<>(createUnboundedVarcharType(), bigintType));
}
return Optional.empty();

// SHORT (Trino SMALLINT) source: widening to integer/bigint, plus double, decimal and varchar targets.
if (fromOrcTypeKind == SHORT) {
return switch (toTrinoType) {
case IntegerType integerType -> Optional.of(new IntegerNumberUpscaleCoercer<>(SMALLINT, integerType));
case BigintType bigintType -> Optional.of(new IntegerNumberUpscaleCoercer<>(SMALLINT, bigintType));
case DoubleType ignored -> Optional.of(new IntegerNumberToDoubleCoercer<>(SMALLINT));
case DecimalType decimalType -> Optional.of(createIntegerNumberToDecimalCoercer(SMALLINT, decimalType));
case VarcharType varcharType -> Optional.of(new IntegerNumberToVarcharCoercer<>(SMALLINT, varcharType));
default -> Optional.empty();
};
}
if (fromOrcTypeKind == FLOAT && toTrinoType instanceof VarcharType varcharType) {
return Optional.of(createFloatToVarcharCoercer(varcharType, true));

// INT (Trino INTEGER) source: widening to bigint, plus double, decimal and varchar targets.
if (fromOrcTypeKind == INT) {
return switch (toTrinoType) {
case BigintType bigintType -> Optional.of(new IntegerNumberUpscaleCoercer<>(INTEGER, bigintType));
case DoubleType ignored -> Optional.of(new IntegerNumberToDoubleCoercer<>(INTEGER));
case DecimalType decimalType -> Optional.of(createIntegerNumberToDecimalCoercer(INTEGER, decimalType));
case VarcharType varcharType -> Optional.of(new IntegerNumberToVarcharCoercer<>(INTEGER, varcharType));
default -> Optional.empty();
};
}
if (fromOrcTypeKind == DOUBLE && toTrinoType instanceof VarcharType varcharType) {
return Optional.of(createDoubleToVarcharCoercer(varcharType, true));

// LONG (Trino BIGINT) source: no integer widening target; only double, decimal and varchar.
if (fromOrcTypeKind == LONG) {
return switch (toTrinoType) {
case DoubleType ignored -> Optional.of(new IntegerNumberToDoubleCoercer<>(BIGINT));
case DecimalType decimalType -> Optional.of(createIntegerNumberToDecimalCoercer(BIGINT, decimalType));
case VarcharType varcharType -> Optional.of(new IntegerNumberToVarcharCoercer<>(BIGINT, varcharType));
default -> Optional.empty();
};
}
if (fromOrcTypeKind == BOOLEAN && toTrinoType instanceof VarcharType varcharType) {
return Optional.of(new BooleanToVarcharCoercer(varcharType));

// FLOAT source: double, decimal and varchar targets.
if (fromOrcTypeKind == FLOAT) {
return switch (toTrinoType) {
case DoubleType ignored -> Optional.of(new FloatToDoubleCoercer());
case DecimalType decimalType -> Optional.of(createRealToDecimalCoercer(decimalType));
case VarcharType varcharType -> Optional.of(createFloatToVarcharCoercer(varcharType, true));
default -> Optional.empty();
};
}
// NOTE(review): the two target-type-first chains below (DoubleType, DecimalType) look like older lines now folded into the per-source switches.
if (toTrinoType instanceof DoubleType) {
if (fromOrcTypeKind == BYTE) {
return Optional.of(new IntegerNumberToDoubleCoercer<>(TINYINT));
}
if (fromOrcTypeKind == SHORT) {
return Optional.of(new IntegerNumberToDoubleCoercer<>(SMALLINT));
}
if (fromOrcTypeKind == INT) {
return Optional.of(new IntegerNumberToDoubleCoercer<>(INTEGER));
}
if (fromOrcTypeKind == LONG) {
return Optional.of(new IntegerNumberToDoubleCoercer<>(BIGINT));
}

// DOUBLE source: real, decimal and varchar targets.
if (fromOrcTypeKind == DOUBLE) {
return switch (toTrinoType) {
case RealType ignored -> Optional.of(new DoubleToFloatCoercer());
case DecimalType decimalType -> Optional.of(createDoubleToDecimalCoercer(decimalType));
case VarcharType varcharType -> Optional.of(createDoubleToVarcharCoercer(varcharType, true));
default -> Optional.empty();
};
}
if (toTrinoType instanceof DecimalType decimalType) {
if (fromOrcTypeKind == BYTE) {
return Optional.of(createIntegerNumberToDecimalCoercer(TINYINT, decimalType));
}
if (fromOrcTypeKind == SHORT) {
return Optional.of(createIntegerNumberToDecimalCoercer(SMALLINT, decimalType));
}
if (fromOrcTypeKind == INT) {
return Optional.of(createIntegerNumberToDecimalCoercer(INTEGER, decimalType));
}
if (fromOrcTypeKind == LONG) {
return Optional.of(createIntegerNumberToDecimalCoercer(BIGINT, decimalType));
}

// DECIMAL source: the source DecimalType is rebuilt from the ORC type's precision and scale.
if (fromOrcTypeKind == DECIMAL) {
// orElseThrow() means a DECIMAL ORC type lacking precision or scale fails here instead of yielding a coercer.
DecimalType sourceType = DecimalType.createDecimalType(fromOrcType.getPrecision().orElseThrow(), fromOrcType.getScale().orElseThrow());
return switch (toTrinoType) {
case TinyintType tinyintType -> Optional.of(createDecimalToInteger(sourceType, tinyintType));
case SmallintType smallintType -> Optional.of(createDecimalToInteger(sourceType, smallintType));
case IntegerType integerType -> Optional.of(createDecimalToInteger(sourceType, integerType));
case BigintType bigintType -> Optional.of(createDecimalToInteger(sourceType, bigintType));
case RealType ignored -> Optional.of(createDecimalToRealCoercer(sourceType));
case DoubleType ignored -> Optional.of(createDecimalToDoubleCoercer(sourceType));
case DecimalType decimalType -> Optional.of(createDecimalToDecimalCoercer(sourceType, decimalType));
case VarcharType varcharType -> Optional.of(createDecimalToVarcharCoercer(sourceType, varcharType));
default -> Optional.empty();
};
}
if ((fromOrcTypeKind == BYTE || fromOrcTypeKind == SHORT || fromOrcTypeKind == INT || fromOrcTypeKind == LONG) && toTrinoType instanceof VarcharType varcharType) {
Type fromType = switch (fromOrcTypeKind) {
case BYTE -> TINYINT;
case SHORT -> SMALLINT;
case INT -> INTEGER;
case LONG -> BIGINT;
default -> throw new UnsupportedOperationException("Unsupported ORC type: " + fromOrcType);

if (fromOrcTypeKind == DATE) {
return switch (toTrinoType) {
case VarcharType varcharType -> Optional.of(new DateToVarcharCoercer(varcharType));
default -> Optional.empty();
};
return Optional.of(new IntegerNumberToVarcharCoercer<>(fromType, varcharType));
}
if (fromOrcTypeKind == DECIMAL && toTrinoType instanceof VarcharType varcharType) {
return Optional.of(createDecimalToVarcharCoercer(
DecimalType.createDecimalType(fromOrcType.getPrecision().orElseThrow(), fromOrcType.getScale().orElseThrow()),
varcharType));

// TIMESTAMP source: coercers are built with TIMESTAMP_NANOS as the source type; date and varchar targets only.
if (fromOrcTypeKind == TIMESTAMP) {
return switch (toTrinoType) {
case DateType dateType -> Optional.of(new LongTimestampToDateCoercer(TIMESTAMP_NANOS, dateType));
case VarcharType varcharType -> Optional.of(new LongTimestampToVarcharCoercer(TIMESTAMP_NANOS, varcharType));
default -> Optional.empty();
};
}

// Varchar-like source (as decided by isVarcharType): every coercer receives VarcharType.VARCHAR as the source type,
// where the if-chain earlier in this listing passed createUnboundedVarcharType().
if (isVarcharType(fromOrcTypeKind)) {
return switch (toTrinoType) {
case BooleanType ignored -> Optional.of(new OrcVarcharToBooleanCoercer(VarcharType.VARCHAR));
case TinyintType tinyintType -> Optional.of(new OrcVarcharToIntegralNumericCoercer<>(VarcharType.VARCHAR, tinyintType));
case SmallintType smallintType -> Optional.of(new OrcVarcharToIntegralNumericCoercer<>(VarcharType.VARCHAR, smallintType));
case IntegerType integerType -> Optional.of(new OrcVarcharToIntegralNumericCoercer<>(VarcharType.VARCHAR, integerType));
case BigintType bigintType -> Optional.of(new OrcVarcharToIntegralNumericCoercer<>(VarcharType.VARCHAR, bigintType));
case RealType ignored -> Optional.of(new VarcharToFloatCoercer(VarcharType.VARCHAR, true));
case DoubleType ignored -> Optional.of(new VarcharToDoubleCoercer(VarcharType.VARCHAR, true));
case DateType dateType -> Optional.of(new VarcharToDateCoercer(VarcharType.VARCHAR, dateType));
// Short vs. long timestamp targets use different coercer classes.
case TimestampType timestampType -> Optional.of(timestampType.isShort()
? new VarcharToShortTimestampCoercer(VarcharType.VARCHAR, timestampType)
: new VarcharToLongTimestampCoercer(VarcharType.VARCHAR, timestampType));
default -> Optional.empty();
};
}

// No branch matched the source kind: report that no coercion is available.
return Optional.empty();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,27 +239,6 @@ protected Map<ColumnContext, String> expectedExceptionsWithTrinoContext()
.put(columnContext("orc", "row_to_row"), "Cannot read SQL type 'smallint' from ORC stream '.row_to_row.ti2si' of type BYTE")
.put(columnContext("orc", "list_to_list"), "Cannot read SQL type 'integer' from ORC stream '.list_to_list.item.ti2int' of type BYTE")
.put(columnContext("orc", "map_to_map"), "Cannot read SQL type 'integer' from ORC stream '.map_to_map.key' of type BYTE")
.put(columnContext("orc", "tinyint_to_smallint"), "Cannot read SQL type 'smallint' from ORC stream '.tinyint_to_smallint' of type BYTE")
.put(columnContext("orc", "tinyint_to_int"), "Cannot read SQL type 'integer' from ORC stream '.tinyint_to_int' of type BYTE")
.put(columnContext("orc", "tinyint_to_bigint"), "Cannot read SQL type 'bigint' from ORC stream '.tinyint_to_bigint' of type BYTE")
.put(columnContext("orc", "double_to_float"), "Cannot read SQL type 'real' from ORC stream '.double_to_float' of type DOUBLE")
.put(columnContext("orc", "longdecimal_to_shortdecimal"), "Decimal does not fit long (invalid table schema?)")
.put(columnContext("orc", "float_to_decimal"), "Cannot read SQL type 'decimal(10,5)' from ORC stream '.float_to_decimal' of type FLOAT")
.put(columnContext("orc", "double_to_decimal"), "Cannot read SQL type 'decimal(10,5)' from ORC stream '.double_to_decimal' of type DOUBLE")
.put(columnContext("orc", "decimal_to_float"), "Cannot read SQL type 'real' from ORC stream '.decimal_to_float' of type DECIMAL")
.put(columnContext("orc", "decimal_to_double"), "Cannot read SQL type 'double' from ORC stream '.decimal_to_double' of type DECIMAL")
.put(columnContext("orc", "longdecimal_to_tinyint"), "Cannot read SQL type 'tinyint' from ORC stream '.longdecimal_to_tinyint' of type DECIMAL")
.put(columnContext("orc", "shortdecimal_to_tinyint"), "Cannot read SQL type 'tinyint' from ORC stream '.shortdecimal_to_tinyint' of type DECIMAL")
.put(columnContext("orc", "longdecimal_to_smallint"), "Cannot read SQL type 'smallint' from ORC stream '.longdecimal_to_smallint' of type DECIMAL")
.put(columnContext("orc", "shortdecimal_to_smallint"), "Cannot read SQL type 'smallint' from ORC stream '.shortdecimal_to_smallint' of type DECIMAL")
.put(columnContext("orc", "too_big_shortdecimal_to_smallint"), "Cannot read SQL type 'smallint' from ORC stream '.too_big_shortdecimal_to_smallint' of type DECIMAL")
.put(columnContext("orc", "longdecimal_to_int"), "Cannot read SQL type 'integer' from ORC stream '.longdecimal_to_int' of type DECIMAL")
.put(columnContext("orc", "shortdecimal_to_int"), "Cannot read SQL type 'integer' from ORC stream '.shortdecimal_to_int' of type DECIMAL")
.put(columnContext("orc", "shortdecimal_with_0_scale_to_int"), "Cannot read SQL type 'integer' from ORC stream '.shortdecimal_with_0_scale_to_int' of type DECIMAL")
.put(columnContext("orc", "longdecimal_to_bigint"), "Cannot read SQL type 'bigint' from ORC stream '.longdecimal_to_bigint' of type DECIMAL")
.put(columnContext("orc", "shortdecimal_to_bigint"), "Cannot read SQL type 'bigint' from ORC stream '.shortdecimal_to_bigint' of type DECIMAL")
.put(columnContext("orc", "short_decimal_to_bounded_varchar"), "Cannot read SQL type 'varchar(30)' from ORC stream '.short_decimal_to_bounded_varchar' of type DECIMAL")
.put(columnContext("orc", "long_decimal_to_bounded_varchar"), "Cannot read SQL type 'varchar(30)' from ORC stream '.long_decimal_to_bounded_varchar' of type DECIMAL")
.put(columnContext("orc", "timestamp_row_to_row"), "Cannot read SQL type 'varchar' from ORC stream '.timestamp_row_to_row.timestamp2string' of type TIMESTAMP with attributes {}")
.put(columnContext("orc", "timestamp_list_to_list"), "Cannot read SQL type 'varchar' from ORC stream '.timestamp_row_to_row.timestamp2string' of type TIMESTAMP with attributes {}")
.put(columnContext("orc", "timestamp_map_to_map"), "Cannot read SQL type 'varchar' from ORC stream '.timestamp_row_to_row.timestamp2string' of type TIMESTAMP with attributes {}")
Expand Down
Loading