Merge branch 'main' into arrow_uhugeint

nickgerrets · Apr 8, 2024 · 0cfc9de · 0cfc9de
2 parents 87a1773 + b4408a8
commit 0cfc9de
Show file tree

Hide file tree

Showing 31 changed files with 317 additions and 252 deletions.
diff --git a/.github/workflows/CodeQuality.yml b/.github/workflows/CodeQuality.yml
@@ -81,7 +81,7 @@ jobs:
     - name: Install python dependencies
       if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
       shell: bash
-      run: python -m pip install clang-5
+      run: python -m pip install libclang
 
     - name: Install libclang
       if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
@@ -91,7 +91,7 @@ jobs:
     - name: Verify C enum integrity
       if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
       shell: bash
-      run: python scripts/verify_enum_integrity.py src/include/duckdb.h --library_path $(llvm-config --libdir)
+      run: python scripts/verify_enum_integrity.py src/include/duckdb.h
 
   tidy-check:
     name: Tidy Check

diff --git a/extension/json/json_scan.cpp b/extension/json/json_scan.cpp
@@ -878,14 +878,14 @@ void JSONScanLocalState::ParseNextChunk(JSONScanGlobalState &gstate) {
 	auto buffer_offset_before = buffer_offset;
 
 	const auto format = current_reader->GetFormat();
-	D_ASSERT(format != JSONFormat::AUTO_DETECT);
 	for (; scan_count < STANDARD_VECTOR_SIZE; scan_count++) {
 		SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
 		auto json_start = buffer_ptr + buffer_offset;
 		idx_t remaining = buffer_size - buffer_offset;
 		if (remaining == 0) {
 			break;
 		}
+		D_ASSERT(format != JSONFormat::AUTO_DETECT);
 		const char *json_end = format == JSONFormat::NEWLINE_DELIMITED ? NextNewline(json_start, remaining)
 		                                                               : NextJSON(json_start, remaining);
 		if (json_end == nullptr) {

diff --git a/scripts/verify_enum_integrity.py b/scripts/verify_enum_integrity.py
@@ -27,10 +27,7 @@ def visit_enum(cursor):
     print(f"Succesfully verified the integrity of enum {enum_name} ({len(enum_constants)} entries)")
 
 
-def parse_enum(file_path, clang_path: Optional[str] = None):
-    if clang_path:
-        clang.cindex.Config.set_library_path(clang_path)
-
+def parse_enum(file_path):
     # Create index
     index = clang.cindex.Index.create()
 
@@ -54,12 +51,11 @@ def parse_enum(file_path, clang_path: Optional[str] = None):
 
     parser = argparse.ArgumentParser(description="Parse a C header file and check enum integrity.")
     parser.add_argument("file_path", type=str, help="Path to the C header file")
-    parser.add_argument("--library_path", type=str, help="Path to the clang library", default=None)
 
     args = parser.parse_args()
     file_path = args.file_path
 
     if not os.path.exists(file_path):
         raise Exception(f"Error: file '{file_path}' does not exist")
 
-    enum_dict = parse_enum(file_path, args.library_path)
+    enum_dict = parse_enum(file_path)
diff --git a/src/common/vector_operations/vector_hash.cpp b/src/common/vector_operations/vector_hash.cpp
@@ -183,23 +183,31 @@ template <bool HAS_RSEL, bool FIRST_HASH>
 static inline void ArrayLoopHash(Vector &input, Vector &hashes, const SelectionVector *rsel, idx_t count) {
 	auto hdata = FlatVector::GetData<hash_t>(hashes);
 
-	if (input.GetVectorType() != VectorType::CONSTANT_VECTOR || input.GetVectorType() != VectorType::FLAT_VECTOR) {
-		input.Flatten(count);
-	}
-
 	UnifiedVectorFormat idata;
 	input.ToUnifiedFormat(count, idata);
 
 	// Hash the children into a temporary
 	auto &child = ArrayVector::GetEntry(input);
 	auto array_size = ArrayType::GetSize(input.GetType());
-	auto is_constant = input.GetVectorType() == VectorType::CONSTANT_VECTOR;
-	auto child_count = array_size * (is_constant ? 1 : count);
+
+	// Figure out how large the child hashes vector should be
+	// TODO: We could use some sort of sparse reverse selection vector to avoid having to allocate a much larger vector
+	auto child_count = array_size * count;
+	if (input.GetVectorType() == VectorType::CONSTANT_VECTOR) {
+		child_count = array_size;
+	} else if (input.GetVectorType() == VectorType::DICTIONARY_VECTOR) {
+		// Based on the largest dict offset
+		for (idx_t i = 0; i < count; i++) {
+			auto ridx = HAS_RSEL ? rsel->get_index(i) : i;
+			auto lidx = idata.sel->get_index(ridx);
+			child_count = MaxValue(child_count, array_size * lidx + array_size);
+		}
+	}
 
 	Vector child_hashes(LogicalType::HASH, child_count);
 	if (child_count > 0) {
-		child_hashes.Flatten(child_count);
 		VectorOperations::Hash(child, child_hashes, child_count);
+		child_hashes.Flatten(child_count);
 	}
 	auto chdata = FlatVector::GetData<hash_t>(child_hashes);
 

diff --git a/src/core_functions/scalar/generic/system_functions.cpp b/src/core_functions/scalar/generic/system_functions.cpp
@@ -47,6 +47,9 @@ struct CurrentSchemasBindData : public FunctionData {
 
 static unique_ptr<FunctionData> CurrentSchemasBind(ClientContext &context, ScalarFunction &bound_function,
                                                    vector<unique_ptr<Expression>> &arguments) {
+	if (arguments[0]->return_type.id() != LogicalTypeId::BOOLEAN) {
+		throw BinderException("current_schemas requires a boolean input");
+	}
 	if (!arguments[0]->IsFoldable()) {
 		throw NotImplementedException("current_schemas requires a constant input");
 	}

diff --git a/src/core_functions/scalar/list/flatten.cpp b/src/core_functions/scalar/list/flatten.cpp
@@ -3,6 +3,7 @@
 #include "duckdb/planner/expression/bound_function_expression.hpp"
 #include "duckdb/storage/statistics/list_stats.hpp"
 #include "duckdb/function/scalar/nested_functions.hpp"
+#include "duckdb/planner/expression/bound_cast_expression.hpp"
 
 namespace duckdb {
 
@@ -113,6 +114,22 @@ static unique_ptr<FunctionData> ListFlattenBind(ClientContext &context, ScalarFu
                                                 vector<unique_ptr<Expression>> &arguments) {
 	D_ASSERT(bound_function.arguments.size() == 1);
 
+	if (arguments[0]->return_type.id() == LogicalTypeId::ARRAY) {
+		auto child_type = ArrayType::GetChildType(arguments[0]->return_type);
+		if (child_type.id() == LogicalTypeId::ARRAY) {
+			child_type = LogicalType::LIST(ArrayType::GetChildType(child_type));
+		}
+		arguments[0] =
+		    BoundCastExpression::AddCastToType(context, std::move(arguments[0]), LogicalType::LIST(child_type));
+	} else if (arguments[0]->return_type.id() == LogicalTypeId::LIST) {
+		auto child_type = ListType::GetChildType(arguments[0]->return_type);
+		if (child_type.id() == LogicalTypeId::ARRAY) {
+			child_type = LogicalType::LIST(ArrayType::GetChildType(child_type));
+			arguments[0] =
+			    BoundCastExpression::AddCastToType(context, std::move(arguments[0]), LogicalType::LIST(child_type));
+		}
+	}
+
 	auto &input_type = arguments[0]->return_type;
 	bound_function.arguments[0] = input_type;
 	if (input_type.id() == LogicalTypeId::UNKNOWN) {

diff --git a/src/core_functions/scalar/list/list_sort.cpp b/src/core_functions/scalar/list/list_sort.cpp
@@ -239,6 +239,9 @@ static void ListSortFunction(DataChunk &args, ExpressionState &state, Vector &re
 			auto &result_entry = ListVector::GetEntry(result);
 			auto result_data = ListVector::GetData(result);
 			for (idx_t i = 0; i < count; i++) {
+				if (!result_validity.RowIsValid(i)) {
+					continue;
+				}
 				for (idx_t j = result_data[i].offset; j < result_data[i].offset + result_data[i].length; j++) {
 					auto b = sel_sorted.get_index(j) - result_data[i].offset;
 					result_entry.SetValue(j, Value::BIGINT(b + 1));

diff --git a/src/function/cast/enum_casts.cpp b/src/function/cast/enum_casts.cpp
@@ -7,43 +7,26 @@ namespace duckdb {
 
 template <class SRC_TYPE, class RES_TYPE>
 bool EnumEnumCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
-	result.SetVectorType(VectorType::FLAT_VECTOR);
-
-	auto &str_vec = EnumType::GetValuesInsertOrder(source.GetType());
-	auto str_vec_ptr = FlatVector::GetData<string_t>(str_vec);
-
+	auto &enum_dictionary = EnumType::GetValuesInsertOrder(source.GetType()); // value strings of the SOURCE enum type
+	auto dictionary_data = FlatVector::GetData<string_t>(enum_dictionary);
 	auto res_enum_type = result.GetType();
 
-	UnifiedVectorFormat vdata;
-	source.ToUnifiedFormat(count, vdata);
-
-	auto source_data = UnifiedVectorFormat::GetData<SRC_TYPE>(vdata);
-	auto source_sel = vdata.sel;
-	auto source_mask = vdata.validity;
-
-	auto result_data = FlatVector::GetData<RES_TYPE>(result);
-	auto &result_mask = FlatVector::Validity(result);
-
 	VectorTryCastData vector_cast_data(result, parameters);
-	for (idx_t i = 0; i < count; i++) {
-		auto src_idx = source_sel->get_index(i);
-		if (!source_mask.RowIsValid(src_idx)) {
-			result_mask.SetInvalid(i);
-			continue;
-		}
-		auto key = EnumType::GetPos(res_enum_type, str_vec_ptr[source_data[src_idx]]);
-		if (key == -1) {
-			// key doesn't exist on result enum
-			if (!parameters.error_message) {
-				result_data[i] = HandleVectorCastError::Operation<RES_TYPE>(
-				    CastExceptionText<SRC_TYPE, RES_TYPE>(source_data[src_idx]), result_mask, i, vector_cast_data);
-			} else {
-				result_mask.SetInvalid(i);
-			}
-			continue;
-		}
-		result_data[i] = UnsafeNumericCast<RES_TYPE>(key);
-	}
+	UnaryExecutor::ExecuteWithNulls<SRC_TYPE, RES_TYPE>( // the lambda below converts one source value at a time
+	    source, result, count, [&](SRC_TYPE value, ValidityMask &mask, idx_t row_idx) {
+		    auto key = EnumType::GetPos(res_enum_type, dictionary_data[value]); // look the source string up in the result enum
+		    if (key == -1) { // -1: the string has no entry in the result enum
+			    if (!parameters.error_message) { // no error_message set: report through HandleVectorCastError
+				    return HandleVectorCastError::Operation<RES_TYPE>(CastExceptionText<SRC_TYPE, RES_TYPE>(value),
+				                                                      mask, row_idx, vector_cast_data);
+			    } else {
+				    mask.SetInvalid(row_idx); // error_message set: only mark this row invalid
+			    }
+			    return RES_TYPE(); // placeholder value for the invalidated row
+		    } else {
+			    return UnsafeNumericCast<RES_TYPE>(key); // position in the result enum, converted to RES_TYPE
+		    }
+	    });
 	return vector_cast_data.all_converted;
 }
 
@@ -65,27 +48,9 @@ template <class SRC>
 static bool EnumToVarcharCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
 	auto &enum_dictionary = EnumType::GetValuesInsertOrder(source.GetType());
 	auto dictionary_data = FlatVector::GetData<string_t>(enum_dictionary);
-	auto result_data = FlatVector::GetData<string_t>(result);
-	auto &result_mask = FlatVector::Validity(result);
-
-	UnifiedVectorFormat vdata;
-	source.ToUnifiedFormat(count, vdata);
-
-	auto source_data = UnifiedVectorFormat::GetData<SRC>(vdata);
-	for (idx_t i = 0; i < count; i++) {
-		auto source_idx = vdata.sel->get_index(i);
-		if (!vdata.validity.RowIsValid(source_idx)) {
-			result_mask.SetInvalid(i);
-			continue;
-		}
-		auto enum_idx = source_data[source_idx];
-		result_data[i] = dictionary_data[enum_idx];
-	}
-	if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) {
-		result.SetVectorType(VectorType::CONSTANT_VECTOR);
-	} else {
-		result.SetVectorType(VectorType::FLAT_VECTOR);
-	}
+
+	UnaryExecutor::Execute<SRC, string_t>(source, result, count, // replaces the hand-written loop removed above
+	                                      [&](SRC enum_idx) { return dictionary_data[enum_idx]; }); // enum index -> its dictionary string
 	return true;
 }
 

diff --git a/src/function/scalar/list/list_select.cpp b/src/function/scalar/list/list_select.cpp
@@ -3,6 +3,7 @@
 #include "duckdb/planner/expression_binder.hpp"
 #include "duckdb/planner/expression/bound_function_expression.hpp"
 #include "duckdb/planner/expression/bound_parameter_expression.hpp"
+#include "duckdb/planner/expression/bound_cast_expression.hpp"
 
 namespace duckdb {
 
@@ -140,6 +141,10 @@ static void ListSelectFunction(DataChunk &args, ExpressionState &state, Vector &
 static unique_ptr<FunctionData> ListSelectBind(ClientContext &context, ScalarFunction &bound_function,
                                                vector<unique_ptr<Expression>> &arguments) {
 	D_ASSERT(bound_function.arguments.size() == 2);
+
+	// If the first argument is an array, cast it to a list
+	arguments[0] = BoundCastExpression::AddArrayCastToList(context, std::move(arguments[0]));
+
 	LogicalType child_type;
 	if (arguments[0]->return_type == LogicalTypeId::UNKNOWN || arguments[1]->return_type == LogicalTypeId::UNKNOWN) {
 		bound_function.arguments[0] = LogicalTypeId::UNKNOWN;

diff --git a/src/function/scalar/list/list_zip.cpp b/src/function/scalar/list/list_zip.cpp
@@ -31,13 +31,14 @@ static void ListZipFunction(DataChunk &args, ExpressionState &state, Vector &res
 	idx_t result_size = 0;
 	vector<idx_t> lengths;
 	for (idx_t j = 0; j < count; j++) {
-
 		// Is flag for current row set
 		bool truncate_to_shortest = false;
 		if (truncate_flags_set) {
-			idx_t flag_idx = input_lists.back().sel->get_index(j);
-			auto flag_data = UnifiedVectorFormat::GetData<bool>(input_lists.back());
-			truncate_to_shortest = flag_data[flag_idx];
+			auto &flag_vec = input_lists.back();
+			idx_t flag_idx = flag_vec.sel->get_index(j);
+			if (flag_vec.validity.RowIsValid(flag_idx)) {
+				truncate_to_shortest = UnifiedVectorFormat::GetData<bool>(flag_vec)[flag_idx];
+			}
 		}
 
 		// Calculation of the outgoing list size

diff --git a/src/function/scalar/strftime_format.cpp b/src/function/scalar/strftime_format.cpp
@@ -1360,6 +1360,20 @@ bool StrpTimeFormat::ParseResult::TryToDate(date_t &result) {
 	return Date::TryFromDate(data[0], data[1], data[2], result);
 }
 
+dtime_t StrpTimeFormat::ParseResult::ToTime() { // builds a dtime_t from data[3..6], shifted by the minute count in data[7]
+	const auto hour_offset = data[7] / Interval::MINS_PER_HOUR; // whole hours contained in data[7]
+	const auto mins_offset = data[7] % Interval::MINS_PER_HOUR; // leftover minutes of data[7]
+	return Time::FromTime(data[3] - hour_offset, data[4] - mins_offset, data[5], data[6]); // NOTE(review): data[7] looks like a UTC offset in minutes - confirm
+}
+
+bool StrpTimeFormat::ParseResult::TryToTime(dtime_t &result) { // writes the time to result; returns false without writing when data[7] is set
+	if (data[7]) { // a non-zero data[7] is rejected here rather than applied
+		return false;
+	}
+	result = Time::FromTime(data[3], data[4], data[5], data[6]); // no offset adjustment, unlike ToTime()
+	return true;
+}
+
 timestamp_t StrpTimeFormat::ParseResult::ToTimestamp() {
 	if (is_special) {
 		if (special == date_t::infinity()) {
@@ -1371,9 +1385,7 @@ timestamp_t StrpTimeFormat::ParseResult::ToTimestamp() {
 	}
 
 	date_t date = Date::FromDate(data[0], data[1], data[2]);
-	const auto hour_offset = data[7] / Interval::MINS_PER_HOUR;
-	const auto mins_offset = data[7] % Interval::MINS_PER_HOUR;
-	dtime_t time = Time::FromTime(data[3] - hour_offset, data[4] - mins_offset, data[5], data[6]);
+	dtime_t time = ToTime();
 	return Timestamp::FromDatetime(date, time);
 }
 
@@ -1382,9 +1394,7 @@ bool StrpTimeFormat::ParseResult::TryToTimestamp(timestamp_t &result) {
 	if (!TryToDate(date)) {
 		return false;
 	}
-	const auto hour_offset = data[7] / Interval::MINS_PER_HOUR;
-	const auto mins_offset = data[7] % Interval::MINS_PER_HOUR;
-	dtime_t time = Time::FromTime(data[3] - hour_offset, data[4] - mins_offset, data[5], data[6]);
+	dtime_t time = ToTime();
 	return Timestamp::TryFromDatetime(date, time, result);
 }
 
@@ -1403,29 +1413,22 @@ bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_
 	return parse_result.TryToDate(result);
 }
 
-bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) const {
+bool StrpTimeFormat::TryParseTime(string_t input, dtime_t &result, string &error_message) const { // parse input into a dtime_t
 	ParseResult parse_result;
 	if (!Parse(input, parse_result)) {
 		error_message = parse_result.FormatError(input, format_specifier);
 		return false;
 	}
-	return parse_result.TryToTimestamp(result);
-}
-
-date_t StrpTimeFormat::ParseDate(string_t input) {
-	ParseResult result;
-	if (!Parse(input, result)) {
-		throw InvalidInputException(result.FormatError(input, format_specifier));
-	}
-	return result.ToDate();
+	return parse_result.TryToTime(result); // may still return false after a successful Parse; error_message is not set on this path
 }
 
-timestamp_t StrpTimeFormat::ParseTimestamp(string_t input) {
-	ParseResult result;
-	if (!Parse(input, result)) {
-		throw InvalidInputException(result.FormatError(input, format_specifier));
+bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) const { // parse input into a timestamp_t
+	ParseResult parse_result;
+	if (!Parse(input, parse_result)) {
+		error_message = parse_result.FormatError(input, format_specifier); // parse failure is reported through error_message
+		return false;
 	}
-	return result.ToTimestamp();
+	return parse_result.TryToTimestamp(result); // conversion failure returns false without touching error_message
 }
 
 } // namespace duckdb
diff --git a/src/include/duckdb/function/scalar/strftime_format.hpp b/src/include/duckdb/function/scalar/strftime_format.hpp
@@ -143,9 +143,11 @@ struct StrpTimeFormat : public StrTimeFormat { // NOLINT: work-around bug in cla
 		date_t special;
 
 		date_t ToDate();
+		dtime_t ToTime();
 		timestamp_t ToTimestamp();
 
 		bool TryToDate(date_t &result);
+		bool TryToTime(dtime_t &result);
 		bool TryToTimestamp(timestamp_t &result);
 
 		DUCKDB_API string FormatError(string_t input, const string &format_specifier);
@@ -160,11 +162,9 @@ struct StrpTimeFormat : public StrTimeFormat { // NOLINT: work-around bug in cla
 	DUCKDB_API bool Parse(string_t str, ParseResult &result) const;
 
 	DUCKDB_API bool TryParseDate(string_t str, date_t &result, string &error_message) const;
+	DUCKDB_API bool TryParseTime(string_t str, dtime_t &result, string &error_message) const;
 	DUCKDB_API bool TryParseTimestamp(string_t str, timestamp_t &result, string &error_message) const;
 
-	date_t ParseDate(string_t str);
-	timestamp_t ParseTimestamp(string_t str);
-
 	void Serialize(Serializer &serializer) const;
 	static StrpTimeFormat Deserialize(Deserializer &deserializer);
 

diff --git a/src/include/duckdb/main/client_context.hpp b/src/include/duckdb/main/client_context.hpp
@@ -167,7 +167,7 @@ class ClientContext : public std::enable_shared_from_this<ClientContext> {
 	                                                 bool requires_valid_transaction = true);
 
 	//! Equivalent to CURRENT_SETTING(key) SQL function.
-	DUCKDB_API SettingLookupResult TryGetCurrentSetting(const std::string &key, Value &result);
+	DUCKDB_API SettingLookupResult TryGetCurrentSetting(const std::string &key, Value &result) const;
 
 	//! Returns the parser options for this client context
 	DUCKDB_API ParserOptions GetParserOptions() const;