Skip to content

Commit

Permalink
Merge branch 'main' into arrow_uhugeint
Browse files Browse the repository at this point in the history
  • Loading branch information
nickgerrets committed Apr 8, 2024
2 parents 87a1773 + b4408a8 commit 0cfc9de
Show file tree
Hide file tree
Showing 31 changed files with 317 additions and 252 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/CodeQuality.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ jobs:
- name: Install python dependencies
if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
shell: bash
run: python -m pip install clang-5
run: python -m pip install libclang

- name: Install libclang
if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
Expand All @@ -91,7 +91,7 @@ jobs:
- name: Verify C enum integrity
if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
shell: bash
run: python scripts/verify_enum_integrity.py src/include/duckdb.h --library_path $(llvm-config --libdir)
run: python scripts/verify_enum_integrity.py src/include/duckdb.h

tidy-check:
name: Tidy Check
Expand Down
2 changes: 1 addition & 1 deletion extension/json/json_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -878,14 +878,14 @@ void JSONScanLocalState::ParseNextChunk(JSONScanGlobalState &gstate) {
auto buffer_offset_before = buffer_offset;

const auto format = current_reader->GetFormat();
D_ASSERT(format != JSONFormat::AUTO_DETECT);
for (; scan_count < STANDARD_VECTOR_SIZE; scan_count++) {
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
auto json_start = buffer_ptr + buffer_offset;
idx_t remaining = buffer_size - buffer_offset;
if (remaining == 0) {
break;
}
D_ASSERT(format != JSONFormat::AUTO_DETECT);
const char *json_end = format == JSONFormat::NEWLINE_DELIMITED ? NextNewline(json_start, remaining)
: NextJSON(json_start, remaining);
if (json_end == nullptr) {
Expand Down
8 changes: 2 additions & 6 deletions scripts/verify_enum_integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@ def visit_enum(cursor):
print(f"Succesfully verified the integrity of enum {enum_name} ({len(enum_constants)} entries)")


def parse_enum(file_path, clang_path: Optional[str] = None):
if clang_path:
clang.cindex.Config.set_library_path(clang_path)

def parse_enum(file_path):
# Create index
index = clang.cindex.Index.create()

Expand All @@ -54,12 +51,11 @@ def parse_enum(file_path, clang_path: Optional[str] = None):

parser = argparse.ArgumentParser(description="Parse a C header file and check enum integrity.")
parser.add_argument("file_path", type=str, help="Path to the C header file")
parser.add_argument("--library_path", type=str, help="Path to the clang library", default=None)

args = parser.parse_args()
file_path = args.file_path

if not os.path.exists(file_path):
raise Exception(f"Error: file '{file_path}' does not exist")

enum_dict = parse_enum(file_path, args.library_path)
enum_dict = parse_enum(file_path)
22 changes: 15 additions & 7 deletions src/common/vector_operations/vector_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,23 +183,31 @@ template <bool HAS_RSEL, bool FIRST_HASH>
static inline void ArrayLoopHash(Vector &input, Vector &hashes, const SelectionVector *rsel, idx_t count) {
auto hdata = FlatVector::GetData<hash_t>(hashes);

if (input.GetVectorType() != VectorType::CONSTANT_VECTOR || input.GetVectorType() != VectorType::FLAT_VECTOR) {
input.Flatten(count);
}

UnifiedVectorFormat idata;
input.ToUnifiedFormat(count, idata);

// Hash the children into a temporary
auto &child = ArrayVector::GetEntry(input);
auto array_size = ArrayType::GetSize(input.GetType());
auto is_constant = input.GetVectorType() == VectorType::CONSTANT_VECTOR;
auto child_count = array_size * (is_constant ? 1 : count);

// Figure out how large the child hashes vector should be
// TODO: We could use some sort of sparse reverse selection vector to avoid having to allocate a much larger vector
auto child_count = array_size * count;
if (input.GetVectorType() == VectorType::CONSTANT_VECTOR) {
child_count = array_size;
} else if (input.GetVectorType() == VectorType::DICTIONARY_VECTOR) {
// Based on the largest dict offset
for (idx_t i = 0; i < count; i++) {
auto ridx = HAS_RSEL ? rsel->get_index(i) : i;
auto lidx = idata.sel->get_index(ridx);
child_count = MaxValue(child_count, array_size * lidx + array_size);
}
}

Vector child_hashes(LogicalType::HASH, child_count);
if (child_count > 0) {
child_hashes.Flatten(child_count);
VectorOperations::Hash(child, child_hashes, child_count);
child_hashes.Flatten(child_count);
}
auto chdata = FlatVector::GetData<hash_t>(child_hashes);

Expand Down
3 changes: 3 additions & 0 deletions src/core_functions/scalar/generic/system_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ struct CurrentSchemasBindData : public FunctionData {

static unique_ptr<FunctionData> CurrentSchemasBind(ClientContext &context, ScalarFunction &bound_function,
vector<unique_ptr<Expression>> &arguments) {
if (arguments[0]->return_type.id() != LogicalTypeId::BOOLEAN) {
throw BinderException("current_schemas requires a boolean input");
}
if (!arguments[0]->IsFoldable()) {
throw NotImplementedException("current_schemas requires a constant input");
}
Expand Down
17 changes: 17 additions & 0 deletions src/core_functions/scalar/list/flatten.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "duckdb/planner/expression/bound_function_expression.hpp"
#include "duckdb/storage/statistics/list_stats.hpp"
#include "duckdb/function/scalar/nested_functions.hpp"
#include "duckdb/planner/expression/bound_cast_expression.hpp"

namespace duckdb {

Expand Down Expand Up @@ -113,6 +114,22 @@ static unique_ptr<FunctionData> ListFlattenBind(ClientContext &context, ScalarFu
vector<unique_ptr<Expression>> &arguments) {
D_ASSERT(bound_function.arguments.size() == 1);

if (arguments[0]->return_type.id() == LogicalTypeId::ARRAY) {
auto child_type = ArrayType::GetChildType(arguments[0]->return_type);
if (child_type.id() == LogicalTypeId::ARRAY) {
child_type = LogicalType::LIST(ArrayType::GetChildType(child_type));
}
arguments[0] =
BoundCastExpression::AddCastToType(context, std::move(arguments[0]), LogicalType::LIST(child_type));
} else if (arguments[0]->return_type.id() == LogicalTypeId::LIST) {
auto child_type = ListType::GetChildType(arguments[0]->return_type);
if (child_type.id() == LogicalTypeId::ARRAY) {
child_type = LogicalType::LIST(ArrayType::GetChildType(child_type));
arguments[0] =
BoundCastExpression::AddCastToType(context, std::move(arguments[0]), LogicalType::LIST(child_type));
}
}

auto &input_type = arguments[0]->return_type;
bound_function.arguments[0] = input_type;
if (input_type.id() == LogicalTypeId::UNKNOWN) {
Expand Down
3 changes: 3 additions & 0 deletions src/core_functions/scalar/list/list_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,9 @@ static void ListSortFunction(DataChunk &args, ExpressionState &state, Vector &re
auto &result_entry = ListVector::GetEntry(result);
auto result_data = ListVector::GetData(result);
for (idx_t i = 0; i < count; i++) {
if (!result_validity.RowIsValid(i)) {
continue;
}
for (idx_t j = result_data[i].offset; j < result_data[i].offset + result_data[i].length; j++) {
auto b = sel_sorted.get_index(j) - result_data[i].offset;
result_entry.SetValue(j, Value::BIGINT(b + 1));
Expand Down
75 changes: 20 additions & 55 deletions src/function/cast/enum_casts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,43 +7,26 @@ namespace duckdb {

template <class SRC_TYPE, class RES_TYPE>
bool EnumEnumCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
result.SetVectorType(VectorType::FLAT_VECTOR);

auto &str_vec = EnumType::GetValuesInsertOrder(source.GetType());
auto str_vec_ptr = FlatVector::GetData<string_t>(str_vec);

auto &enum_dictionary = EnumType::GetValuesInsertOrder(source.GetType());
auto dictionary_data = FlatVector::GetData<string_t>(enum_dictionary);
auto res_enum_type = result.GetType();

UnifiedVectorFormat vdata;
source.ToUnifiedFormat(count, vdata);

auto source_data = UnifiedVectorFormat::GetData<SRC_TYPE>(vdata);
auto source_sel = vdata.sel;
auto source_mask = vdata.validity;

auto result_data = FlatVector::GetData<RES_TYPE>(result);
auto &result_mask = FlatVector::Validity(result);

VectorTryCastData vector_cast_data(result, parameters);
for (idx_t i = 0; i < count; i++) {
auto src_idx = source_sel->get_index(i);
if (!source_mask.RowIsValid(src_idx)) {
result_mask.SetInvalid(i);
continue;
}
auto key = EnumType::GetPos(res_enum_type, str_vec_ptr[source_data[src_idx]]);
if (key == -1) {
// key doesn't exist on result enum
if (!parameters.error_message) {
result_data[i] = HandleVectorCastError::Operation<RES_TYPE>(
CastExceptionText<SRC_TYPE, RES_TYPE>(source_data[src_idx]), result_mask, i, vector_cast_data);
} else {
result_mask.SetInvalid(i);
}
continue;
}
result_data[i] = UnsafeNumericCast<RES_TYPE>(key);
}
UnaryExecutor::ExecuteWithNulls<SRC_TYPE, RES_TYPE>(
source, result, count, [&](SRC_TYPE value, ValidityMask &mask, idx_t row_idx) {
auto key = EnumType::GetPos(res_enum_type, dictionary_data[value]);
if (key == -1) {
if (!parameters.error_message) {
return HandleVectorCastError::Operation<RES_TYPE>(CastExceptionText<SRC_TYPE, RES_TYPE>(value),
mask, row_idx, vector_cast_data);
} else {
mask.SetInvalid(row_idx);
}
return RES_TYPE();
} else {
return UnsafeNumericCast<RES_TYPE>(key);
}
});
return vector_cast_data.all_converted;
}

Expand All @@ -65,27 +48,9 @@ template <class SRC>
static bool EnumToVarcharCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
auto &enum_dictionary = EnumType::GetValuesInsertOrder(source.GetType());
auto dictionary_data = FlatVector::GetData<string_t>(enum_dictionary);
auto result_data = FlatVector::GetData<string_t>(result);
auto &result_mask = FlatVector::Validity(result);

UnifiedVectorFormat vdata;
source.ToUnifiedFormat(count, vdata);

auto source_data = UnifiedVectorFormat::GetData<SRC>(vdata);
for (idx_t i = 0; i < count; i++) {
auto source_idx = vdata.sel->get_index(i);
if (!vdata.validity.RowIsValid(source_idx)) {
result_mask.SetInvalid(i);
continue;
}
auto enum_idx = source_data[source_idx];
result_data[i] = dictionary_data[enum_idx];
}
if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) {
result.SetVectorType(VectorType::CONSTANT_VECTOR);
} else {
result.SetVectorType(VectorType::FLAT_VECTOR);
}

UnaryExecutor::Execute<SRC, string_t>(source, result, count,
[&](SRC enum_idx) { return dictionary_data[enum_idx]; });
return true;
}

Expand Down
5 changes: 5 additions & 0 deletions src/function/scalar/list/list_select.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "duckdb/planner/expression_binder.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"
#include "duckdb/planner/expression/bound_parameter_expression.hpp"
#include "duckdb/planner/expression/bound_cast_expression.hpp"

namespace duckdb {

Expand Down Expand Up @@ -140,6 +141,10 @@ static void ListSelectFunction(DataChunk &args, ExpressionState &state, Vector &
static unique_ptr<FunctionData> ListSelectBind(ClientContext &context, ScalarFunction &bound_function,
vector<unique_ptr<Expression>> &arguments) {
D_ASSERT(bound_function.arguments.size() == 2);

// If the first argument is an array, cast it to a list
arguments[0] = BoundCastExpression::AddArrayCastToList(context, std::move(arguments[0]));

LogicalType child_type;
if (arguments[0]->return_type == LogicalTypeId::UNKNOWN || arguments[1]->return_type == LogicalTypeId::UNKNOWN) {
bound_function.arguments[0] = LogicalTypeId::UNKNOWN;
Expand Down
9 changes: 5 additions & 4 deletions src/function/scalar/list/list_zip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,14 @@ static void ListZipFunction(DataChunk &args, ExpressionState &state, Vector &res
idx_t result_size = 0;
vector<idx_t> lengths;
for (idx_t j = 0; j < count; j++) {

// Is flag for current row set
bool truncate_to_shortest = false;
if (truncate_flags_set) {
idx_t flag_idx = input_lists.back().sel->get_index(j);
auto flag_data = UnifiedVectorFormat::GetData<bool>(input_lists.back());
truncate_to_shortest = flag_data[flag_idx];
auto &flag_vec = input_lists.back();
idx_t flag_idx = flag_vec.sel->get_index(j);
if (flag_vec.validity.RowIsValid(flag_idx)) {
truncate_to_shortest = UnifiedVectorFormat::GetData<bool>(flag_vec)[flag_idx];
}
}

// Calculation of the outgoing list size
Expand Down
45 changes: 24 additions & 21 deletions src/function/scalar/strftime_format.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1360,6 +1360,20 @@ bool StrpTimeFormat::ParseResult::TryToDate(date_t &result) {
return Date::TryFromDate(data[0], data[1], data[2], result);
}

//! Builds a dtime_t from the parsed time fields, shifted by the parsed offset.
//! NOTE(review): data[7] is treated as an offset in minutes (it is split using
//! Interval::MINS_PER_HOUR) - presumably the UTC offset; confirm against the parser.
dtime_t StrpTimeFormat::ParseResult::ToTime() {
	const auto offset_in_minutes = data[7];
	// Split the offset into whole hours and the remaining minutes, then subtract
	// each part from the matching time field before handing it to Time::FromTime
	const auto shifted_first = data[3] - offset_in_minutes / Interval::MINS_PER_HOUR;
	const auto shifted_second = data[4] - offset_in_minutes % Interval::MINS_PER_HOUR;
	return Time::FromTime(shifted_first, shifted_second, data[5], data[6]);
}

//! Attempts to convert the parsed fields into a dtime_t without applying any offset.
//! Returns false (leaving `result` untouched) when data[7] is non-zero; otherwise
//! writes the time built from data[3..6] into `result` and returns true.
//! NOTE(review): data[7] looks like the parsed UTC offset in minutes - confirm.
bool StrpTimeFormat::ParseResult::TryToTime(dtime_t &result) {
	const bool has_offset = data[7] != 0;
	if (!has_offset) {
		result = Time::FromTime(data[3], data[4], data[5], data[6]);
	}
	return !has_offset;
}

timestamp_t StrpTimeFormat::ParseResult::ToTimestamp() {
if (is_special) {
if (special == date_t::infinity()) {
Expand All @@ -1371,9 +1385,7 @@ timestamp_t StrpTimeFormat::ParseResult::ToTimestamp() {
}

date_t date = Date::FromDate(data[0], data[1], data[2]);
const auto hour_offset = data[7] / Interval::MINS_PER_HOUR;
const auto mins_offset = data[7] % Interval::MINS_PER_HOUR;
dtime_t time = Time::FromTime(data[3] - hour_offset, data[4] - mins_offset, data[5], data[6]);
dtime_t time = ToTime();
return Timestamp::FromDatetime(date, time);
}

Expand All @@ -1382,9 +1394,7 @@ bool StrpTimeFormat::ParseResult::TryToTimestamp(timestamp_t &result) {
if (!TryToDate(date)) {
return false;
}
const auto hour_offset = data[7] / Interval::MINS_PER_HOUR;
const auto mins_offset = data[7] % Interval::MINS_PER_HOUR;
dtime_t time = Time::FromTime(data[3] - hour_offset, data[4] - mins_offset, data[5], data[6]);
dtime_t time = ToTime();
return Timestamp::TryFromDatetime(date, time, result);
}

Expand All @@ -1403,29 +1413,22 @@ bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_
return parse_result.TryToDate(result);
}

bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) const {
bool StrpTimeFormat::TryParseTime(string_t input, dtime_t &result, string &error_message) const {
ParseResult parse_result;
if (!Parse(input, parse_result)) {
error_message = parse_result.FormatError(input, format_specifier);
return false;
}
return parse_result.TryToTimestamp(result);
}

date_t StrpTimeFormat::ParseDate(string_t input) {
ParseResult result;
if (!Parse(input, result)) {
throw InvalidInputException(result.FormatError(input, format_specifier));
}
return result.ToDate();
return parse_result.TryToTime(result);
}

timestamp_t StrpTimeFormat::ParseTimestamp(string_t input) {
ParseResult result;
if (!Parse(input, result)) {
throw InvalidInputException(result.FormatError(input, format_specifier));
bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) const {
ParseResult parse_result;
if (!Parse(input, parse_result)) {
error_message = parse_result.FormatError(input, format_specifier);
return false;
}
return result.ToTimestamp();
return parse_result.TryToTimestamp(result);
}

} // namespace duckdb
6 changes: 3 additions & 3 deletions src/include/duckdb/function/scalar/strftime_format.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,11 @@ struct StrpTimeFormat : public StrTimeFormat { // NOLINT: work-around bug in cla
date_t special;

date_t ToDate();
dtime_t ToTime();
timestamp_t ToTimestamp();

bool TryToDate(date_t &result);
bool TryToTime(dtime_t &result);
bool TryToTimestamp(timestamp_t &result);

DUCKDB_API string FormatError(string_t input, const string &format_specifier);
Expand All @@ -160,11 +162,9 @@ struct StrpTimeFormat : public StrTimeFormat { // NOLINT: work-around bug in cla
DUCKDB_API bool Parse(string_t str, ParseResult &result) const;

DUCKDB_API bool TryParseDate(string_t str, date_t &result, string &error_message) const;
DUCKDB_API bool TryParseTime(string_t str, dtime_t &result, string &error_message) const;
DUCKDB_API bool TryParseTimestamp(string_t str, timestamp_t &result, string &error_message) const;

date_t ParseDate(string_t str);
timestamp_t ParseTimestamp(string_t str);

void Serialize(Serializer &serializer) const;
static StrpTimeFormat Deserialize(Deserializer &deserializer);

Expand Down
2 changes: 1 addition & 1 deletion src/include/duckdb/main/client_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ class ClientContext : public std::enable_shared_from_this<ClientContext> {
bool requires_valid_transaction = true);

//! Equivalent to CURRENT_SETTING(key) SQL function.
DUCKDB_API SettingLookupResult TryGetCurrentSetting(const std::string &key, Value &result);
DUCKDB_API SettingLookupResult TryGetCurrentSetting(const std::string &key, Value &result) const;

//! Returns the parser options for this client context
DUCKDB_API ParserOptions GetParserOptions() const;
Expand Down
Loading

0 comments on commit 0cfc9de

Please sign in to comment.