From 227fc393c32c3ba4f20681eeef916a6acea6ee1a Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 19 Jun 2024 00:13:27 +0800 Subject: [PATCH 1/8] feat: Implement equality = and inequality <> support for StringView --- Cargo.toml | 24 +++++++------- datafusion-cli/Cargo.lock | 30 ++++++++--------- datafusion-cli/Cargo.toml | 22 ++++++------- datafusion/common/src/scalar/mod.rs | 17 +++++++++- datafusion/expr/src/type_coercion/binary.rs | 1 + .../sqllogictest/test_files/string_view.slt | 33 +++++++++++++++++++ 6 files changed, 88 insertions(+), 39 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/string_view.slt diff --git a/Cargo.toml b/Cargo.toml index 290dd64021b7..be6e0c672f6f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -157,15 +157,15 @@ unused_imports = "deny" ## Temporary arrow-rs patch until 52.1.0 is released [patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index b0b41a12328d..15f7809ee5f5 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -131,7 +131,7 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-arith", "arrow-array", @@ -151,7 +151,7 @@ dependencies = [ [[package]] name = "arrow-arith" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -165,7 +165,7 @@ dependencies = [ [[package]] name = "arrow-array" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "ahash", "arrow-buffer", @@ -181,7 +181,7 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "bytes", "half", @@ -191,7 +191,7 @@ dependencies = [ [[package]] name = "arrow-cast" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -211,7 +211,7 @@ dependencies = [ [[package]] name = "arrow-csv" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -229,7 +229,7 @@ dependencies = [ [[package]] name = "arrow-data" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-buffer", "arrow-schema", @@ -240,7 +240,7 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -254,7 +254,7 @@ dependencies = [ [[package]] name = "arrow-json" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -273,7 +273,7 @@ dependencies = [ [[package]] name = "arrow-ord" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -287,7 +287,7 @@ dependencies = [ [[package]] name = "arrow-row" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "ahash", "arrow-array", @@ -301,12 +301,12 @@ dependencies = [ [[package]] name = "arrow-schema" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" [[package]] name = "arrow-select" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "ahash", "arrow-array", @@ -319,7 +319,7 @@ dependencies = [ [[package]] name = "arrow-string" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "arrow-array", "arrow-buffer", @@ -2704,7 +2704,7 @@ dependencies = [ [[package]] name = "parquet" version = "52.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=72467c670f8c38130e4743347407f1a542e59e0c#72467c670f8c38130e4743347407f1a542e59e0c" +source = "git+https://github.com/apache/arrow-rs.git?rev=d0a88c651991b7fc4b970cf94fa77f4ec3def22d#d0a88c651991b7fc4b970cf94fa77f4ec3def22d" dependencies = [ "ahash", "arrow-array", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index b4883264731e..0e7b712d8b19 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -66,14 +66,14 @@ rstest = "0.17" ## Temporary arrow-rs patch until 52.1.0 is released [patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } -parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "72467c670f8c38130e4743347407f1a542e59e0c" } +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } +parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "d0a88c651991b7fc4b970cf94fa77f4ec3def22d" } diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 96bf4216d9a1..c7724605a7b6 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -1715,6 +1715,22 @@ impl ScalarValue { )?; Arc::new(array) } + DataType::Utf8View => { + let array = scalars + .map(|sv| { + if let ScalarValue::Utf8View(v) = sv { + Ok(v.unwrap_or_default()) + } else { + _internal_err!( + "Inconsistent types in ScalarValue::iter_to_array. \ + Expected {data_type:?}, got {sv:?}" + ) + } + }) + .collect::>>()?; + let array = StringViewArray::from_iter_values(array.into_iter()); + Arc::new(array) + } // explicitly enumerate unsupported types so newly added // types must be aknowledged, Time32 and Time64 types are // not supported if the TimeUnit is not valid (Time32 can @@ -1726,7 +1742,6 @@ impl ScalarValue { | DataType::Time64(TimeUnit::Millisecond) | DataType::Map(_, _) | DataType::RunEndEncoded(_, _) - | DataType::Utf8View | DataType::BinaryView | DataType::ListView(_) | DataType::LargeListView(_) => { diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index d7cb4b1a3ef6..2299bbcaf19e 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -932,6 +932,7 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option (LargeUtf8, Utf8) => Some(LargeUtf8), (Utf8, LargeUtf8) => Some(LargeUtf8), (LargeUtf8, LargeUtf8) => Some(LargeUtf8), + (Utf8View, Utf8View) => Some(Utf8View), _ => None, } } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt new file mode 100644 index 000000000000..332aa9a525ca --- /dev/null +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +statement ok +create table test as values (arrow_cast('Andrew', 'Utf8View'), arrow_cast('X', 'Utf8View')), + (arrow_cast('Xiangpeng', 'Utf8View'), arrow_cast('Xiangpeng', 'Utf8View')), + (arrow_cast('Raphael', 'Utf8View'), arrow_cast('R', 'Utf8View')); + +query ? +select * from test where column1 = column2; + +query ? +select * from test where column1 <> column2; + +query ? +select * from test where column1 = arrow_cast('Andrew', 'Utf8View'); + +query ? +select * from test where column1 <> arrow_cast('Andrew', 'Utf8View'); \ No newline at end of file From 9a9006c43b01e9fdea8af2c74b96b0bbd63abed2 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 19 Jun 2024 00:28:12 +0800 Subject: [PATCH 2/8] chore: Add tests for the StringView --- .../sqllogictest/test_files/string_view.slt | 43 ++++++++++++++++--- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 332aa9a525ca..4711422a746a 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -20,14 +20,47 @@ create table test as values (arrow_cast('Andrew', 'Utf8View'), arrow_cast('X', ' (arrow_cast('Xiangpeng', 'Utf8View'), arrow_cast('Xiangpeng', 'Utf8View')), (arrow_cast('Raphael', 'Utf8View'), arrow_cast('R', 'Utf8View')); -query ? + + +query B +select arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'); +---- +false + +query B +select arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'); +---- +true + +query B +select arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'); +---- +true + +query B +select arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'); +---- +true + +query ?? select * from test where column1 = column2; +---- +Xiangpeng Xiangpeng -query ? +query ?? select * from test where column1 <> column2; +---- +Andrew X +Raphael R + -query ? +query ?? select * from test where column1 = arrow_cast('Andrew', 'Utf8View'); +---- +Andrew X -query ? -select * from test where column1 <> arrow_cast('Andrew', 'Utf8View'); \ No newline at end of file +query ?? +select * from test where column1 <> arrow_cast('Andrew', 'Utf8View'); +---- +Xiangpeng Xiangpeng +Raphael R \ No newline at end of file From 5930faa08a30e9fb20e4ebead37117acabaa3395 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 19 Jun 2024 00:33:37 +0800 Subject: [PATCH 3/8] chore --- datafusion/common/src/scalar/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index c7724605a7b6..6f63a5342edf 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -1719,7 +1719,7 @@ impl ScalarValue { let array = scalars .map(|sv| { if let ScalarValue::Utf8View(v) = sv { - Ok(v.unwrap_or_default()) + Ok(v) } else { _internal_err!( "Inconsistent types in ScalarValue::iter_to_array. \ @@ -1728,7 +1728,7 @@ impl ScalarValue { } }) .collect::>>()?; - let array = StringViewArray::from_iter_values(array.into_iter()); + let array = StringViewArray::from(array); Arc::new(array) } // explicitly enumerate unsupported types so newly added From e36a93e4bb414df1ffa2f6cc4b2ee105ff0712e1 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 19 Jun 2024 00:38:13 +0800 Subject: [PATCH 4/8] chore: Update tests for NULL --- datafusion/sqllogictest/test_files/string_view.slt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 4711422a746a..dab5ac98340f 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -18,9 +18,8 @@ statement ok create table test as values (arrow_cast('Andrew', 'Utf8View'), arrow_cast('X', 'Utf8View')), (arrow_cast('Xiangpeng', 'Utf8View'), arrow_cast('Xiangpeng', 'Utf8View')), - (arrow_cast('Raphael', 'Utf8View'), arrow_cast('R', 'Utf8View')); - - + (arrow_cast('Raphael', 'Utf8View'), arrow_cast('R', 'Utf8View')), + (arrow_cast(NULL, 'Utf8View'), arrow_cast('R', 'Utf8View')); query B select arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'); From 0f0de6d522c33d0e637b76442eca07d042781fd3 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 19 Jun 2024 09:45:41 +0800 Subject: [PATCH 5/8] fix: Used build_array_string! --- datafusion/common/src/scalar/mod.rs | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 6f63a5342edf..86ac115cca02 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -1570,6 +1570,7 @@ impl ScalarValue { DataType::UInt16 => build_array_primitive!(UInt16Array, UInt16), DataType::UInt32 => build_array_primitive!(UInt32Array, UInt32), DataType::UInt64 => build_array_primitive!(UInt64Array, UInt64), + DataType::Utf8View => build_array_string!(StringViewArray, Utf8View), DataType::Utf8 => build_array_string!(StringArray, Utf8), DataType::LargeUtf8 => build_array_string!(LargeStringArray, LargeUtf8), DataType::Binary => build_array_string!(BinaryArray, Binary), @@ -1715,22 +1716,6 @@ impl ScalarValue { )?; Arc::new(array) } - DataType::Utf8View => { - let array = scalars - .map(|sv| { - if let ScalarValue::Utf8View(v) = sv { - Ok(v) - } else { - _internal_err!( - "Inconsistent types in ScalarValue::iter_to_array. \ - Expected {data_type:?}, got {sv:?}" - ) - } - }) - .collect::>>()?; - let array = StringViewArray::from(array); - Arc::new(array) - } // explicitly enumerate unsupported types so newly added // types must be aknowledged, Time32 and Time64 types are // not supported if the TimeUnit is not valid (Time32 can From 6659d049d5f345e384693b654c2d5e40f6d200e3 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 19 Jun 2024 10:09:43 +0800 Subject: [PATCH 6/8] chore: Update string_coercion function to handle Utf8View type in binary.rs --- datafusion/expr/src/type_coercion/binary.rs | 2 +- .../sqllogictest/test_files/string_view.slt | 41 ++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 2299bbcaf19e..d57b5228cb74 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -932,7 +932,7 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option (LargeUtf8, Utf8) => Some(LargeUtf8), (Utf8, LargeUtf8) => Some(LargeUtf8), (LargeUtf8, LargeUtf8) => Some(LargeUtf8), - (Utf8View, Utf8View) => Some(Utf8View), + (Utf8View, Utf8View) | (Utf8View, Utf8) | (Utf8, Utf8View) => Some(Utf8View), _ => None, } } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index dab5ac98340f..e4055a6962d9 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. + +# test StringViewArray with Utf8View columns statement ok create table test as values (arrow_cast('Andrew', 'Utf8View'), arrow_cast('X', 'Utf8View')), (arrow_cast('Xiangpeng', 'Utf8View'), arrow_cast('Xiangpeng', 'Utf8View')), @@ -62,4 +64,41 @@ query ?? select * from test where column1 <> arrow_cast('Andrew', 'Utf8View'); ---- Xiangpeng Xiangpeng -Raphael R \ No newline at end of file +Raphael R + +statement ok +drop table test; + + +# test StringViewArray with Utf8 and Utf8View columns +statement ok +create table test as values ('Andrew', arrow_cast('X', 'Utf8View')), + ('Xiangpeng', arrow_cast('Xiangpeng', 'Utf8View')), + ('Raphael', arrow_cast('R', 'Utf8View')), + (NULL, arrow_cast('R', 'Utf8View')); + +query T? +select * from test where column1 = column2; +---- +Xiangpeng Xiangpeng + +query T? +select * from test where column1 <> column2; +---- +Andrew X +Raphael R + + +query T? +select * from test where column1 = arrow_cast('Andrew', 'Utf8View'); +---- +Andrew X + +query T? +select * from test where column1 <> arrow_cast('Andrew', 'Utf8View'); +---- +Xiangpeng Xiangpeng +Raphael R + +statement ok +drop table test; From 308075de9e7a70bc78a86dc000404ee6d8346160 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 19 Jun 2024 10:12:36 +0800 Subject: [PATCH 7/8] chore: add tests --- datafusion/sqllogictest/test_files/string_view.slt | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index e4055a6962d9..563a8a32ca20 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -54,18 +54,27 @@ select * from test where column1 <> column2; Andrew X Raphael R - query ?? select * from test where column1 = arrow_cast('Andrew', 'Utf8View'); ---- Andrew X +query ?? +select * from test where column1 = 'Andrew'; +---- +Andrew X + query ?? select * from test where column1 <> arrow_cast('Andrew', 'Utf8View'); ---- Xiangpeng Xiangpeng Raphael R +query ?? +select * from test where column1 <> 'Andrew'; +---- +Andrew X + statement ok drop table test; @@ -88,7 +97,6 @@ select * from test where column1 <> column2; Andrew X Raphael R - query T? select * from test where column1 = arrow_cast('Andrew', 'Utf8View'); ---- From 94ede954ba9a0812f23ec65112f419489ea0bb1a Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 19 Jun 2024 10:36:28 +0800 Subject: [PATCH 8/8] chore: ci --- datafusion/sqllogictest/test_files/string_view.slt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 563a8a32ca20..3be3c94770db 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -73,7 +73,8 @@ Raphael R query ?? select * from test where column1 <> 'Andrew'; ---- -Andrew X +Xiangpeng Xiangpeng +Raphael R statement ok drop table test;