Skip to content
This repository has been archived by the owner on May 6, 2024. It is now read-only.

[POAE7-2544] String LOWER/UPPER op support for Arrow format #147

Merged
merged 8 commits into from
Nov 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cider/exec/plan/parser/SubstraitToAnalyzerExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ bool getExprUpdatable(std::unordered_map<std::shared_ptr<Analyzer::Expr>, bool>
return map.find(expr) == map.end() || !map.find(expr)->second;
}

bool isStringFunction(std::string function_name) {
std::unordered_set<std::string> supportedStrFunctionSet{"substring"};
bool isStringFunction(const std::string& function_name) {
std::unordered_set<std::string> supportedStrFunctionSet{"substring", "lower", "upper"};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please define the input argument as const std::string& name
and cause the supportedStrFunctionSet only has three names, so if we want make this function more fast, codes can be written:

std::string_view names[] = {"substring", "lower", "upper"};
return std::find(std::cbegin(names), std::cend(names), name) != std::cend(names);

or

std::string_view names[] = {"substring", "lower", "upper"};
for (size_t i = 0; i < sizeof(names); ++i) {
  if (names[i] == name) {
    return true;
  }
return false;

Copy link
Contributor Author

@YBRua YBRua Nov 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the optimization advice!

  1. pass-by-const-ref has been added.

  2. Currently there are only 3 names, but more names are to be added in the future as more string ops are supported. I think leaving this part as is would be fine?

return supportedStrFunctionSet.find(function_name) != supportedStrFunctionSet.end();
}

Expand Down Expand Up @@ -799,10 +799,10 @@ std::shared_ptr<Analyzer::Expr> Substrait2AnalyzerExprConverter::buildStrExpr(
function_name.begin(), function_name.end(), function_name.begin(), ::toupper);
auto string_op_kind = name_to_string_op_kind(function_name);
switch (string_op_kind) {
// case SqlStringOpKind::LOWER:
// return makeExpr<Analyzer::LowerStringOper>(args);
// case SqlStringOpKind::UPPER:
// return makeExpr<Analyzer::UpperStringOper>(args);
case SqlStringOpKind::LOWER:
return makeExpr<Analyzer::LowerStringOper>(args);
case SqlStringOpKind::UPPER:
return makeExpr<Analyzer::UpperStringOper>(args);
// case SqlStringOpKind::INITCAP:
// return makeExpr<Analyzer::InitCapStringOper>(args);
// case SqlStringOpKind::REVERSE:
Expand Down
126 changes: 122 additions & 4 deletions cider/tests/functionality/CiderStringTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class CiderStringTestArrow : public CiderTestBase {
QueryArrowDataGenerator::generateBatchByTypes(
schema_,
array_,
30,
50,
{"col_1", "col_2"},
{CREATE_SUBSTRAIT_TYPE(I32), CREATE_SUBSTRAIT_TYPE(Varchar)},
{0, 0},
Expand Down Expand Up @@ -120,7 +120,7 @@ class CiderStringNullableTestArrow : public CiderTestBase {
QueryArrowDataGenerator::generateBatchByTypes(
schema_,
array_,
30,
50,
{"col_1", "col_2"},
{CREATE_SUBSTRAIT_TYPE(I32), CREATE_SUBSTRAIT_TYPE(Varchar)},
{2, 2},
Expand Down Expand Up @@ -350,18 +350,88 @@ TEST_F(CiderStringTestArrow, ArrowSubstringNestTest) {
assertQueryArrow("SELECT * FROM test WHERE SUBSTRING(col_2, 1, 3) = 'aaa'");
assertQueryArrow("SELECT * FROM test WHERE SUBSTRING(col_2, 1, 3) <> 'bbb'");
assertQueryArrow("SELECT * FROM test WHERE SUBSTRING(col_2, 1, 3) > 'aaa'");

assertQueryArrow("SELECT SUBSTRING(SUBSTRING(col_2, 1, 8), 1, 4) FROM test ");
}

TEST_F(CiderStringRandomTestArrow, ArrowSubstringNestTest) {
assertQueryArrow("SELECT * FROM test WHERE SUBSTRING(col_2, 1, 3) = 'aaa'");
assertQueryArrow("SELECT * FROM test WHERE SUBSTRING(col_2, 1, 3) <> 'bbb'");
assertQueryArrow("SELECT * FROM test WHERE SUBSTRING(col_2, 1, 3) > 'aaa'");

assertQueryArrow("SELECT SUBSTRING(SUBSTRING(col_2, 1, 8), 1, 4) FROM test ");
}

TEST_F(CiderStringNullableTestArrow, ArrowBasicStringTest) {
assertQueryArrow("SELECT col_2 FROM test ");
assertQueryArrow("SELECT col_1, col_2 FROM test ");
assertQueryArrow("SELECT * FROM test ");
assertQueryArrow("SELECT col_2 FROM test where col_2 = 'aaaa'");
assertQueryArrow("SELECT col_2 FROM test where col_2 = '0000000000'");
assertQueryArrow("SELECT col_2 FROM test where col_2 <> '0000000000'");
assertQueryArrow("SELECT col_1 FROM test where col_2 <> '1111111111'");
assertQueryArrow("SELECT col_1, col_2 FROM test where col_2 <> '2222222222'");
assertQueryArrow("SELECT col_2 FROM test where col_2 <> 'aaaaaaaaaaa'");
assertQueryArrow("SELECT * FROM test where col_2 <> 'abcdefghijklmn'");
assertQueryArrow("SELECT col_2 FROM test where col_2 IS NOT NULL");
assertQueryArrow("SELECT col_2 FROM test where col_2 < 'uuu'");
}

TEST_F(CiderStringNullableTestArrow, ArrowBasicStringLikeTest) {
assertQueryArrow("SELECT col_2 FROM test where col_2 LIKE '%1111'");
assertQueryArrow("SELECT col_2 FROM test where col_2 LIKE '1111%'");
assertQueryArrow("SELECT col_2 FROM test where col_2 LIKE '%1111%'");
assertQueryArrow("SELECT col_2 FROM test where col_2 LIKE '%1234%'");
assertQueryArrow("SELECT col_2 FROM test where col_2 LIKE '22%22'");
assertQueryArrow("SELECT col_2 FROM test where col_2 LIKE '_33%'");
assertQueryArrow("SELECT col_2 FROM test where col_2 LIKE '44_%'");

assertQueryArrow(
"SELECT col_2 FROM test where col_2 LIKE '5555%' OR col_2 LIKE '%6666'");
assertQueryArrow(
"SELECT col_2 FROM test where col_2 LIKE '7777%' AND col_2 LIKE '%8888'");
assertQueryArrow("SELECT col_2 FROM test where col_2 LIKE '%1111'",
"like_wo_cast.json");
assertQueryArrow("SELECT col_2 FROM test where col_2 NOT LIKE '1111%'");
assertQueryArrow("SELECT col_2 FROM test where col_2 NOT LIKE '44_4444444'");
assertQueryArrow(
"SELECT col_2 FROM test where col_2 NOT LIKE '44_4%' and col_2 NOT LIKE '%111%'");
}

TEST_F(CiderStringNullableTestArrow, ArrowSubstringTest) {
// variable source string
assertQueryArrow("SELECT SUBSTRING(col_2, 1, 10) FROM test ");
assertQueryArrow("SELECT SUBSTRING(col_2, 1, 5) FROM test ");

// out of range
assertQueryArrow("SELECT SUBSTRING(col_2, 4, 8) FROM test ");
assertQueryArrow("SELECT SUBSTRING(col_2, 0, 12) FROM test ");
assertQueryArrow("SELECT SUBSTRING(col_2, 12, 0) FROM test ");
assertQueryArrow("SELECT SUBSTRING(col_2, 12, 2) FROM test ");

// from for
assertQueryArrow("SELECT SUBSTRING(col_2 from 2 for 8) FROM test ");

// zero length
assertQueryArrow("SELECT SUBSTRING(col_2, 4, 0) FROM test ");

// negative wrap
assertQueryArrow("SELECT SUBSTRING(col_2, -4, 2) FROM test ");
}

TEST_F(CiderStringTestArrow, ArrowBasicStringTest) {
assertQueryArrow("SELECT col_2 FROM test ");
assertQueryArrow("SELECT col_1, col_2 FROM test ");
assertQueryArrow("SELECT * FROM test ");
assertQueryArrow("SELECT col_2 FROM test where col_2 = 'aaaa'");
assertQueryArrow("SELECT col_2 FROM test where col_2 = '0000000000'");
assertQueryArrow("SELECT col_2 FROM test where col_2 <> '0000000000'");
assertQueryArrow("SELECT col_1 FROM test where col_2 <> '1111111111'");
assertQueryArrow("SELECT col_1, col_2 FROM test where col_2 <> '2222222222'");
assertQueryArrow("SELECT * FROM test where col_2 <> 'aaaaaaaaaaa'");
assertQueryArrow("SELECT * FROM test where col_2 <> 'abcdefghijklmn'");
assertQueryArrow("SELECT col_2 FROM test where col_2 IS NOT NULL");
assertQueryArrow("SELECT col_2 FROM test where col_2 < 'uuu'");
}

TEST_F(CiderStringNullableTestArrow, ArrowSubstringNestTest) {
assertQueryArrow("SELECT * FROM test WHERE SUBSTRING(col_2, 1, 3) = 'aaa'");
assertQueryArrow("SELECT * FROM test WHERE SUBSTRING(col_2, 1, 3) <> 'bbb'");
Expand All @@ -370,6 +440,54 @@ TEST_F(CiderStringNullableTestArrow, ArrowSubstringNestTest) {
assertQueryArrow("SELECT SUBSTRING(SUBSTRING(col_2, 1, 8), 1, 4) FROM test ");
}

TEST_F(CiderStringTestArrow, ArrowCaseConvertionTest) {
// select column from table
assertQueryArrow("SELECT col_2, LOWER(col_2) FROM test;", "stringop_lower.json");
assertQueryArrow("SELECT col_2, UPPER(col_2) FROM test;", "stringop_upper.json");

// select literal from table
assertQueryArrow("SELECT LOWER('aAbBcCdD12') FROM test;",
"stringop_lower_literal.json");
assertQueryArrow("SELECT UPPER('aAbBcCdD12') FROM test;",
"stringop_upper_literal.json");

// string op on filter clause
assertQueryArrow("SELECT col_2 FROM test WHERE LOWER(col_2) = 'aaaaaaaaaa'",
"stringop_lower_condition.json");
assertQueryArrow("SELECT col_2 FROM test WHERE UPPER(col_2) = 'AAAAAAAAAA'",
"stringop_upper_condition.json");

/// NOTE: (YBRua) Skipped for now because we dont expect queries without FROM clauses.
/// 1. Behaviors of Cider and DuckDb are different w.r.t. this query.
/// DuckDb produces only 1 row, while Cider produces input_row_num rows.
/// Because the compiled row_func IR always runs for input_row_num times
/// at runtime in current implementation of Cider.
/// 2. If no input table (no FROM clause) is given, the generated Substrait plan will
/// have a "virtualTable" (instead of a "namedTable") as a placeholder input.
/// <https://substrait.io/relations/logical_relations/#virtual-table>
// select literal
// assertQueryArrow("SELECT LOWER('ABCDEFG');", "stringop_lower_constexpr_null.json");
// assertQueryArrow("SELECT UPPER('abcdefg');", "stringop_upper_constexpr_null.json");
}

TEST_F(CiderStringNullableTestArrow, ArrowCaseConvertionTest) {
// select column from table
assertQueryArrow("SELECT col_2, LOWER(col_2) FROM test;", "stringop_lower_null.json");
assertQueryArrow("SELECT col_2, UPPER(col_2) FROM test;", "stringop_upper_null.json");

// select literal from table
assertQueryArrow("SELECT LOWER('aAbBcCdD12') FROM test;",
"stringop_lower_literal_null.json");
assertQueryArrow("SELECT UPPER('aAbBcCdD12') FROM test;",
"stringop_upper_literal_null.json");

// string op on filter clause
assertQueryArrow("SELECT col_2 FROM test WHERE LOWER(col_2) = 'aaaaaaaaaa'",
"stringop_lower_condition_null.json");
assertQueryArrow("SELECT col_2 FROM test WHERE UPPER(col_2) = 'AAAAAAAAAA'",
"stringop_upper_condition_null.json");
}

class CiderConstantStringTest : public CiderTestBase {
public:
CiderConstantStringTest() {
Expand Down
116 changes: 116 additions & 0 deletions cider/tests/substrait_plan_files/stringop_lower.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"extensionUris": [
{
"extensionUriAnchor": 1,
"uri": "/functions_string.yaml"
}
],
"extensions": [
{
"extensionFunction": {
"extensionUriReference": 1,
"functionAnchor": 0,
"name": "lower:opt_vchar"
}
}
],
"relations": [
{
"root": {
"input": {
"project": {
"common": {
"emit": {
"outputMapping": [
2,
3
]
}
},
"input": {
"read": {
"common": {
"direct": {}
},
"baseSchema": {
"names": [
"COL_1",
"COL_2"
],
"struct": {
"types": [
{
"i32": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
{
"varchar": {
"length": 10,
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
}
],
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"namedTable": {
"names": [
"TEST"
]
}
}
},
"expressions": [
{
"selection": {
"directReference": {
"structField": {
"field": 1
}
},
"rootReference": {}
}
},
{
"scalarFunction": {
"functionReference": 0,
"args": [],
"outputType": {
"varchar": {
"length": 10,
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"arguments": [
{
"value": {
"selection": {
"directReference": {
"structField": {
"field": 1
}
},
"rootReference": {}
}
}
}
]
}
}
]
}
},
"names": [
"COL_2",
"EXPR$1"
]
}
}
],
"expectedTypeUrls": []
}
Loading