Skip to content

Commit

Permalink
Fix Spark cast(int as binary)
Browse files Browse the repository at this point in the history
  • Loading branch information
rui-mo committed May 23, 2024
1 parent 066a72f commit 1d4bbb0
Show file tree
Hide file tree
Showing 10 changed files with 181 additions and 0 deletions.
19 changes: 19 additions & 0 deletions velox/docs/functions/spark/conversion.rst
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,22 @@ Valid example
SELECT cast(' -3E+2' as decimal(12, 2)); -- -300.00
SELECT cast('-3E+2 ' as decimal(12, 2)); -- -300.00
SELECT cast(' -3E+2 ' as decimal(12, 2)); -- -300.00

Cast to Varbinary
-----------------

From integral types
^^^^^^^^^^^^^^^^^^^

Casting an integral value to the varbinary type is allowed.
The bytes of the input value are written to a byte array in big-endian order (most significant byte first).
Supported types are tinyint, smallint, integer and bigint.

Valid example

::

SELECT cast(cast(18 as tinyint) as binary); -- 12
SELECT cast(cast(180 as smallint) as binary); -- 00 B4
SELECT cast(cast(180000 as integer) as binary); -- 00 02 BF 20
SELECT cast(cast(180000 as bigint) as binary); -- 00 00 00 00 00 02 BF 20
55 changes: 55 additions & 0 deletions velox/expression/CastExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,26 @@ void CastExpr::applyPeeled(
fromType->asRow(),
toType);
break;
case TypeKind::VARBINARY: {
switch (fromType->kind()) {
case TypeKind::TINYINT:
result = applyIntToBinaryCast<int8_t>(rows, context, input);
return;
case TypeKind::SMALLINT:
result = applyIntToBinaryCast<int16_t>(rows, context, input);
return;
case TypeKind::INTEGER:
result = applyIntToBinaryCast<int32_t>(rows, context, input);
return;
case TypeKind::BIGINT:
result = applyIntToBinaryCast<int64_t>(rows, context, input);
return;
default:
// Other source types are handled by the default branch of the parent
// switch statement.
}
[[fallthrough]];
}
default: {
// Handle primitive type conversions.
VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
Expand Down Expand Up @@ -838,6 +858,41 @@ VectorPtr CastExpr::applyTimestampToVarcharCast(
return result;
}

// Casts integral values to VARBINARY. Each value is written as sizeof(TInput)
// bytes, most significant byte first (big-endian); e.g. SMALLINT 180 becomes
// the two bytes 0x00 0xB4 and TINYINT -26 becomes the single byte 0xE6.
//
// @param rows Rows of 'input' to convert.
// @param context Evaluation context passed to ensureWritable() and
// applyToSelectedNoThrowLocal().
// @param input Source vector; read through SimpleVector<TInput>.
// @return A VARBINARY vector that is writable for 'rows', with nulls cleared
// for 'rows'.
// Raises a user error (VELOX_USER_CHECK) when the active cast hooks report
// that int-to-binary casts are not supported.
template <typename TInput>
VectorPtr CastExpr::applyIntToBinaryCast(
    const SelectivityVector& rows,
    exec::EvalCtx& context,
    const BaseVector& input) {
  static_assert(
      std::is_integral_v<TInput>,
      "applyIntToBinaryCast requires an integral input type");

  VELOX_USER_CHECK(
      hooks_->canCastIntToBinary(),
      "Cannot cast {} to VARBINARY.",
      CppToType<TInput>::create()->toString());

  VectorPtr result;
  context.ensureWritable(rows, VARBINARY(), result);
  result->clearNulls(rows);
  auto flatResult = result->asFlatVector<StringView>();
  const auto simpleInput = input.as<SimpleVector<TInput>>();

  applyToSelectedNoThrowLocal(context, rows, result, [&](vector_size_t row) {
    // Shift an unsigned copy of the value: right-shifting a negative signed
    // integer is implementation-defined before C++20, while the unsigned
    // conversion is modular and keeps the same bit pattern.
    using TUnsigned = std::make_unsigned_t<TInput>;
    auto bits = static_cast<TUnsigned>(simpleInput->valueAt(row));

    // Row-local scratch buffer. setNoCopy() is handed a view over stack
    // memory, which relies on the created string view always being inlined
    // for int types (at most 8 bytes here).
    char bytes[sizeof(TInput)];

    // Fill from the last byte backwards so that bytes[0] ends up holding the
    // most significant byte.
    for (size_t i = sizeof(TInput); i-- > 0;) {
      bytes[i] = static_cast<char>(bits & 0xFF);
      bits >>= 8;
    }
    flatResult->setNoCopy(row, StringView(bytes, sizeof(TInput)));
  });

  return result;
}

void CastExpr::apply(
const SelectivityVector& rows,
const VectorPtr& input,
Expand Down
6 changes: 6 additions & 0 deletions velox/expression/CastExpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,12 @@ class CastExpr : public SpecialForm {
const TypePtr& toType,
VectorPtr& castResult);

// Casts the integral values of 'input' at 'rows' to VARBINARY and returns the
// resulting vector. 'TInput' is the native C++ type of the input values
// (instantiated for int8_t, int16_t, int32_t and int64_t). Raises a user error
// if the cast hooks report that int-to-binary casts are not supported.
template <typename TInput>
VectorPtr applyIntToBinaryCast(
const SelectivityVector& rows,
exec::EvalCtx& context,
const BaseVector& input);

template <typename TInput, typename TOutput>
void applyFloatingPointToDecimalCastKernel(
const SelectivityVector& rows,
Expand Down
3 changes: 3 additions & 0 deletions velox/expression/CastHooks.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ class CastHooks {
public:
virtual ~CastHooks() = default;

// Returns whether casting integral types (TINYINT, SMALLINT, INTEGER, BIGINT)
// to VARBINARY is supported. CastExpr::applyIntToBinaryCast raises a
// "Cannot cast ... to VARBINARY." user error when this returns false.
virtual bool canCastIntToBinary() const = 0;

virtual Timestamp castStringToTimestamp(const StringView& view) const = 0;

virtual int32_t castStringToDate(const StringView& dateString) const = 0;
Expand Down
4 changes: 4 additions & 0 deletions velox/expression/PrestoCastHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ PrestoCastHooks::PrestoCastHooks(const core::QueryConfig& config)
}
}

// Integral-to-VARBINARY casts are not supported with the Presto hooks;
// returning false makes CastExpr::applyIntToBinaryCast fail with a user error.
bool PrestoCastHooks::canCastIntToBinary() const {
return false;
}

Timestamp PrestoCastHooks::castStringToTimestamp(const StringView& view) const {
auto result = util::fromTimestampWithTimezoneString(view.data(), view.size());

Expand Down
3 changes: 3 additions & 0 deletions velox/expression/PrestoCastHooks.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ class PrestoCastHooks : public CastHooks {
public:
explicit PrestoCastHooks(const core::QueryConfig& config);

// Returns false: casting integral types to VARBINARY is not supported with the
// Presto cast hooks.
bool canCastIntToBinary() const override;

// Uses the default implementation of 'castFromDateString'.
Timestamp castStringToTimestamp(const StringView& view) const override;

Expand Down
11 changes: 11 additions & 0 deletions velox/expression/tests/CastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1704,6 +1704,17 @@ TEST_F(CastExprTest, decimalToDecimal) {
"Cannot cast DECIMAL '-99999999999999999999999999999999999999' to DECIMAL(38, 1)");
}

// Checks that casting each integral type (TINYINT, SMALLINT, INTEGER, BIGINT)
// to VARBINARY is rejected with the "Cannot cast <TYPE> to VARBINARY." message.
// NOTE(review): presumably this fixture evaluates with PrestoCastHooks, whose
// canCastIntToBinary() returns false - confirm against the fixture setup.
TEST_F(CastExprTest, integerToBinary) {
testInvalidCast<int8_t>(
"varbinary", {12}, "Cannot cast TINYINT to VARBINARY.");
testInvalidCast<int16_t>(
"varbinary", {12}, "Cannot cast SMALLINT to VARBINARY.");
testInvalidCast<int32_t>(
"varbinary", {12}, "Cannot cast INTEGER to VARBINARY.");
testInvalidCast<int64_t>(
"varbinary", {12}, "Cannot cast BIGINT to VARBINARY.");
}

TEST_F(CastExprTest, integerToDecimal) {
testIntToDecimalCasts<int8_t>();
testIntToDecimalCasts<int16_t>();
Expand Down
4 changes: 4 additions & 0 deletions velox/functions/sparksql/specialforms/SparkCastHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@

namespace facebook::velox::functions::sparksql {

// Integral-to-VARBINARY casts are supported with the Spark hooks; returning
// true lets CastExpr::applyIntToBinaryCast perform the conversion.
bool SparkCastHooks::canCastIntToBinary() const {
return true;
}

Timestamp SparkCastHooks::castStringToTimestamp(const StringView& view) const {
return util::fromTimestampString(view.data(), view.size());
}
Expand Down
3 changes: 3 additions & 0 deletions velox/functions/sparksql/specialforms/SparkCastHooks.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ namespace facebook::velox::functions::sparksql {
// This class provides cast hooks following Spark semantics.
class SparkCastHooks : public exec::CastHooks {
public:
// Returns true: casting integral types to VARBINARY is supported with the
// Spark cast hooks.
bool canCastIntToBinary() const override;

// TODO: Spark hook allows more string patterns than Presto.
Timestamp castStringToTimestamp(const StringView& view) const override;

Expand Down
73 changes: 73 additions & 0 deletions velox/functions/sparksql/tests/SparkCastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -556,5 +556,78 @@ TEST_F(SparkCastExprTest, fromString) {
-30000},
DECIMAL(12, 2)));
}

// TINYINT -> VARBINARY: each expected value is the single byte holding the
// input's two's-complement bit pattern (e.g. -26 -> 0xE6). Covers positive,
// negative, zero and the int8_t limits.
TEST_F(SparkCastExprTest, tinyintToBinary) {
testCast<int8_t, std::string>(
TINYINT(),
VARBINARY(),
{18,
-26,
0,
110,
std::numeric_limits<int8_t>::max(),
std::numeric_limits<int8_t>::min()},
// The explicit length keeps the NUL byte in the expected string.
{std::string("\x12", 1),
std::string("\xE6", 1),
std::string("\0", 1),
std::string("\x6E", 1),
std::string("\x7F", 1),
std::string("\x80", 1)});
}

// SMALLINT -> VARBINARY: each expected value is the input's two-byte
// two's-complement pattern, most significant byte first (e.g. 180 -> 00 B4,
// -199 -> FF 39). Covers positive, negative, zero and the int16_t limits.
TEST_F(SparkCastExprTest, smallintToBinary) {
testCast<int16_t, std::string>(
SMALLINT(),
VARBINARY(),
{180,
-199,
0,
12300,
std::numeric_limits<int16_t>::max(),
std::numeric_limits<int16_t>::min()},
// The explicit length keeps embedded NUL bytes in the expected strings.
{std::string("\0\xB4", 2),
std::string("\xFF\x39", 2),
std::string("\0\0", 2),
std::string("\x30\x0C", 2),
std::string("\x7F\xFF", 2),
std::string("\x80\00", 2)});
}

// INTEGER -> VARBINARY: each expected value is the input's four-byte
// two's-complement pattern, most significant byte first (e.g.
// 180000 -> 00 02 BF 20). Covers positive, negative, zero and the int32_t
// limits.
TEST_F(SparkCastExprTest, integerToBinary) {
testCast<int32_t, std::string>(
INTEGER(),
VARBINARY(),
{18,
-26,
0,
180000,
std::numeric_limits<int32_t>::max(),
std::numeric_limits<int32_t>::min()},
// The explicit length keeps embedded NUL bytes in the expected strings.
{std::string("\0\0\0\x12", 4),
std::string("\xFF\xFF\xFF\xE6", 4),
std::string("\0\0\0\0", 4),
std::string("\0\x02\xBF\x20", 4),
std::string("\x7F\xFF\xFF\xFF", 4),
std::string("\x80\0\0\0", 4)});
}

// BIGINT -> VARBINARY: each expected value is the input's eight-byte
// two's-complement pattern, most significant byte first (e.g.
// 123456 -> 00 00 00 00 00 01 E2 40). Covers positive, negative, zero and the
// int64_t limits.
TEST_F(SparkCastExprTest, bigintToBinary) {
testCast<int64_t, std::string>(
BIGINT(),
VARBINARY(),
{123456,
-256789,
0,
180000,
std::numeric_limits<int64_t>::max(),
std::numeric_limits<int64_t>::min()},
// The explicit length keeps embedded NUL bytes in the expected strings.
{std::string("\0\0\0\0\0\x01\xE2\x40", 8),
std::string("\xFF\xFF\xFF\xFF\xFF\xFC\x14\xEB", 8),
std::string("\0\0\0\0\0\0\0\0", 8),
std::string("\0\0\0\0\0\x02\xBF\x20", 8),
std::string("\x7F\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8),
std::string("\x80\x00\x00\x00\x00\x00\x00\x00", 8)});
}

} // namespace
} // namespace facebook::velox::test

0 comments on commit 1d4bbb0

Please sign in to comment.