
Commit

Merge branch 'main' into materialized_insert
kryonix authored Feb 29, 2024
2 parents fd092a4 + 68e4f93 commit 1ae5793
Showing 106 changed files with 1,664 additions and 962 deletions.
1 change: 1 addition & 0 deletions .github/config/out_of_tree_extensions.cmake
@@ -90,5 +90,6 @@ if (NOT WIN32)
LOAD_TESTS DONT_LINK
GIT_URL https://github.com/duckdb/substrait
GIT_TAG 870bab8725d1123905296bfb1f35ce737434e0b3
+ APPLY_PATCHES
)
endif()
38 changes: 38 additions & 0 deletions .github/patches/extensions/substrait/substrait.patch
@@ -0,0 +1,38 @@
diff --git a/src/to_substrait.cpp b/src/to_substrait.cpp
index 03d9778..d2429c6 100644
--- a/src/to_substrait.cpp
+++ b/src/to_substrait.cpp
@@ -777,8 +777,31 @@ substrait::Rel *DuckDBToSubstrait::TransformLimit(LogicalOperator &dop) {
auto stopn = res->mutable_fetch();
stopn->set_allocated_input(TransformOp(*dop.children[0]));

- stopn->set_offset(dlimit.offset_val);
- stopn->set_count(dlimit.limit_val);
+ idx_t limit_val;
+ idx_t offset_val;
+
+ switch(dlimit.limit_val.Type()) {
+ case LimitNodeType::CONSTANT_VALUE:
+ limit_val = dlimit.limit_val.GetConstantValue();
+ break;
+ case LimitNodeType::UNSET:
+ limit_val = 2ULL << 62ULL;
+ break;
+ default:
+ throw InternalException("Unsupported limit value type");
+ }
+ switch(dlimit.offset_val.Type()) {
+ case LimitNodeType::CONSTANT_VALUE:
+ offset_val = dlimit.offset_val.GetConstantValue();
+ break;
+ case LimitNodeType::UNSET:
+ offset_val = 0;
+ break;
+ default:
+ throw InternalException("Unsupported offset value type");
+ }
+ stopn->set_offset(offset_val);
+ stopn->set_count(limit_val);
return res;
}
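
The patch adapts the extension to DuckDB's reworked limit representation, where a limit or offset is a node that can hold a constant, be unset, or carry an expression. As a standalone illustration, here is a minimal sketch of the same resolution pattern; LimitNode and Resolve are hypothetical stand-ins for this example, not the extension's API.

// Sketch only: hypothetical types mirroring the switch logic in the patch.
#include <cstdint>
#include <stdexcept>

enum class LimitNodeType { CONSTANT_VALUE, UNSET, EXPRESSION_VALUE };

struct LimitNode {
    LimitNodeType type = LimitNodeType::UNSET;
    uint64_t constant = 0;
    LimitNodeType Type() const { return type; }
    uint64_t GetConstantValue() const { return constant; }
};

// Constants pass through, unset falls back to a caller-supplied default,
// and anything else (e.g. expression-valued limits) is rejected.
uint64_t Resolve(const LimitNode &node, uint64_t unset_default) {
    switch (node.Type()) {
    case LimitNodeType::CONSTANT_VALUE:
        return node.GetConstantValue();
    case LimitNodeType::UNSET:
        return unset_default;
    default:
        throw std::runtime_error("unsupported limit value type");
    }
}

int main() {
    LimitNode limit;  // unset: default to 2^63, the patch's "no limit" sentinel
    LimitNode offset; // unset: default to row 0
    return Resolve(limit, 2ULL << 62ULL) > Resolve(offset, 0) ? 0 : 1;
}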

1 change: 0 additions & 1 deletion .github/workflows/Julia.yml
@@ -69,7 +69,6 @@ jobs:
- '1.6'
- '1.7'
- '1'
- - 'nightly'
os:
- ubuntu-latest
arch:
2 changes: 1 addition & 1 deletion .github/workflows/LinuxRelease.yml
@@ -107,7 +107,7 @@ jobs:
zip -j duckdb_cli-linux-amd64.zip build/release/duckdb
zip -j libduckdb-linux-amd64.zip build/release/src/libduckdb*.* src/amalgamation/duckdb.hpp src/include/duckdb.h
zip -j libduckdb-src.zip src/amalgamation/duckdb.hpp src/amalgamation/duckdb.cpp src/include/duckdb.h
- zip -j duckdb_odbc-linux-amd64.zip build/release/tools/odbc/libduckdb_odbc.so tools/odbc/linux_setup/unixodbc_setup.sh
+ zip -j duckdb_odbc-linux-amd64.zip build/release/tools/odbc/libduckdb_odbc.so tools/odbc/linux_setup/unixodbc_setup.sh tools/odbc/linux_setup/update_odbc_path.py
python3 scripts/asset-upload-gha.py libduckdb-src.zip libduckdb-linux-amd64.zip duckdb_cli-linux-amd64.zip duckdb_odbc-linux-amd64.zip
- uses: actions/upload-artifact@v3
47 changes: 30 additions & 17 deletions .github/workflows/Wasm.yml
@@ -2,10 +2,19 @@ name: DuckDB-Wasm extensions
on:
workflow_dispatch:
inputs:
# Git ref of the duckdb-wasm repo
duckdb_wasm_ref:
required: true
type: string
# Git ref of the duckdb repo
duckdb_ref:
required: true
type: string
# Platforms to build, as a JSON array
platforms:
required: false
default: '["wasm_mvp", "wasm_eh", "wasm_threads"]'
type: string
# Publish extensions on extensions.duckdb.org?
release_s3:
required: true
@@ -21,16 +30,18 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- duckdb_wasm_arch: [ 'mvp', 'eh', 'threads' ]
+ duckdb_wasm_arch: ${{ fromJSON(github.event.inputs.platforms) }}
env:
VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
DUCKDB_PLATFORM: "wasm_${{ matrix.duckdb_wasm_arch }}"
DUCKDB_PLATFORM: "${{ matrix.duckdb_wasm_arch }}"

steps:
- uses: actions/checkout@v3
with:
- ref: ${{ inputs.duckdb_ref }}
+ ref: ${{ inputs.duckdb_wasm_ref }}
fetch-depth: 0
submodules: true
repository: duckdb/duckdb-wasm

- uses: mymindstorm/setup-emsdk@v12
with:
@@ -50,32 +61,34 @@ jobs:
with:
key: ${{ github.job }}-${{ matrix.duckdb_wasm_arch }}

- name: PatchDuckDB
run: |
cd submodules/duckdb
git checkout ${{ github.event.inputs.duckdb_ref }}
git apply ../../duckdb.patch
cd ../..
cp .github/config/extension_config_wasm.cmake submodules/duckdb/extension/extension_config.cmake
- name: Build Wasm module MVP
- if: ${{ matrix.duckdb_wasm_arch == 'mvp' }}
+ if: ${{ matrix.duckdb_wasm_arch == 'wasm_mvp' }}
run: |
mkdir -p ./build/wasm_mvp
emcmake cmake -G "Ninja" -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_mvp -DCMAKE_CXX_FLAGS="-DDUCKDB_CUSTOM_PLATFORM=wasm_mvp" -DDUCKDB_EXPLICIT_PLATFORM=wasm_mvp -DLOCAL_EXTENSION_REPO='build/to_be_deployed' -DDUCKDB_EXTENSION_CONFIGS=".github/config/in_tree_extensions.cmake" -DSKIP_EXTENSIONS="httpfs"
emmake ninja -j8 -Cbuild/wasm_mvp
DUCKDB_PLATFORM=wasm_mvp DUCKDB_WASM_LOADABLE_EXTENSIONS="signed" GEN=ninja ./scripts/wasm_build_lib.sh relsize mvp
- name: Build Wasm module EH
- if: ${{ matrix.duckdb_wasm_arch == 'eh' }}
+ if: ${{ matrix.duckdb_wasm_arch == 'wasm_eh' }}
run: |
mkdir -p ./build/wasm_eh
emcmake cmake -G "Ninja" -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_eh -DCMAKE_CXX_FLAGS="-fwasm-exceptions -DWEBDB_FAST_EXCEPTIONS=1 -DDUCKDB_CUSTOM_PLATFORM=wasm_eh" -DDUCKDB_EXPLICIT_PLATFORM=wasm_eh -DLOCAL_EXTENSION_REPO='build/to_be_deployed' -DDUCKDB_EXTENSION_CONFIGS=".github/config/in_tree_extensions.cmake" -DSKIP_EXTENSIONS="httpfs"
emmake ninja -j8 -Cbuild/wasm_eh
DUCKDB_PLATFORM=wasm_eh DUCKDB_WASM_LOADABLE_EXTENSIONS="signed" GEN=ninja ./scripts/wasm_build_lib.sh relsize eh
- name: Build Wasm module THREADS
- if: ${{ matrix.duckdb_wasm_arch == 'threads' }}
+ if: ${{ matrix.duckdb_wasm_arch == 'wasm_threads' }}
run: |
mkdir -p ./build/wasm_threads
emcmake cmake -G "Ninja" -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_threads -DCMAKE_CXX_FLAGS="-fwasm-exceptions -DWEBDB_FAST_EXCEPTIONS=1 -DWITH_WASM_THREADS=1 -DWITH_WASM_SIMD=1 -DWITH_WASM_BULK_MEMORY=1 -DDUCKDB_CUSTOM_PLATFORM=wasm_threads" -DDUCKDB_EXPLICIT_PLATFORM=wasm_threads -DLOCAL_EXTENSION_REPO='build/to_be_deployed' -DDUCKDB_EXTENSION_CONFIGS=".github/config/in_tree_extensions.cmake" -DSKIP_EXTENSIONS="httpfs"
emmake ninja -j8 -Cbuild/wasm_threads
DUCKDB_PLATFORM=wasm_threads DUCKDB_WASM_LOADABLE_EXTENSIONS="signed" GEN=ninja ./scripts/wasm_build_lib.sh relsize coi
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: duckdb_extensions_${{ env.DUCKDB_PLATFORM }}
- path: build/to_be_deployed/${{ inputs.duckdb_ref }}/${{ env.DUCKDB_PLATFORM }}
+ path: build/extension_repository/${{ inputs.duckdb_ref }}/${{ env.DUCKDB_PLATFORM }}
retention-days: 1

publish:
@@ -85,7 +98,7 @@ jobs:
- build_wasm
strategy:
matrix:
- duckdb_arch: [ 'wasm_mvp', 'wasm_eh', 'wasm_threads' ]
+ duckdb_arch: ${{ fromJSON(github.event.inputs.platforms) }}
steps:
- uses: actions/checkout@v3

6 changes: 6 additions & 0 deletions extension/json/json_functions/json_create.cpp
@@ -61,6 +61,9 @@ static LogicalType GetJSONType(StructNames &const_struct_names, const LogicalTyp
// The nested types need to conform as well
case LogicalTypeId::LIST:
return LogicalType::LIST(GetJSONType(const_struct_names, ListType::GetChildType(type)));
+ case LogicalTypeId::ARRAY:
+ 	return LogicalType::ARRAY(GetJSONType(const_struct_names, ArrayType::GetChildType(type)),
+ 	                          ArrayType::GetSize(type));
// Struct and MAP are treated as JSON values
case LogicalTypeId::STRUCT: {
child_list_t<LogicalType> child_types;
@@ -435,6 +438,9 @@ static void CreateValuesList(const StructNames &names, yyjson_mut_doc *doc, yyjs

static void CreateValuesArray(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
idx_t count) {

+ value_v.Flatten(count);

// Initialize array for the nested values
auto &child_v = ArrayVector::GetEntry(value_v);
auto array_size = ArrayType::GetSize(value_v.GetType());
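
These hunks extend JSON creation to DuckDB's fixed-size ARRAY type: the type-mapping recursion must carry the array's element count through unchanged, and CreateValuesArray flattens the vector up front so child entries can be addressed positionally. A toy sketch of the type-rewriting shape, with made-up Type and TypeId in place of DuckDB's LogicalType API:

// Toy sketch: rewrite leaf types recursively; LIST drops no information,
// while a fixed-size ARRAY must keep its element count.
#include <cstddef>
#include <memory>

enum class TypeId { VARCHAR, JSON, LIST, ARRAY };

struct Type {
    TypeId id;
    std::shared_ptr<Type> child; // element type for LIST/ARRAY, else null
    size_t array_size;           // fixed element count, ARRAY only, else 0
};

Type ToJSONType(const Type &t) {
    switch (t.id) {
    case TypeId::VARCHAR:
        return {TypeId::JSON, nullptr, 0};
    case TypeId::LIST:
        return {TypeId::LIST, std::make_shared<Type>(ToJSONType(*t.child)), 0};
    case TypeId::ARRAY:
        // same recursion, but the size travels with the rewritten type
        return {TypeId::ARRAY, std::make_shared<Type>(ToJSONType(*t.child)), t.array_size};
    default:
        return t;
    }
}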
117 changes: 107 additions & 10 deletions extension/parquet/column_writer.cpp
@@ -663,19 +663,17 @@ void BasicColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {

auto &column_writer = writer.GetWriter();
auto start_offset = column_writer.GetTotalWritten();
- auto page_offset = start_offset;
// flush the dictionary
if (HasDictionary(state)) {
column_chunk.meta_data.statistics.distinct_count = DictionarySize(state);
column_chunk.meta_data.statistics.__isset.distinct_count = true;
- column_chunk.meta_data.dictionary_page_offset = page_offset;
+ column_chunk.meta_data.dictionary_page_offset = start_offset;
column_chunk.meta_data.__isset.dictionary_page_offset = true;
FlushDictionary(state, state.stats_state.get());
- page_offset += state.write_info[0].compressed_size;
}

// record the start position of the pages for this column
- column_chunk.meta_data.data_page_offset = page_offset;
+ column_chunk.meta_data.data_page_offset = column_writer.GetTotalWritten();
SetParquetStatistics(state, column_chunk);

// write the individual pages to disk
@@ -1828,9 +1826,102 @@ void ListColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
child_writer->FinalizeWrite(*state.child_state);
}

//===--------------------------------------------------------------------===//
// Array Column Writer
//===--------------------------------------------------------------------===//
class ArrayColumnWriter : public ListColumnWriter {
public:
ArrayColumnWriter(ParquetWriter &writer, idx_t schema_idx, vector<string> schema_path_p, idx_t max_repeat,
idx_t max_define, unique_ptr<ColumnWriter> child_writer_p, bool can_have_nulls)
: ListColumnWriter(writer, schema_idx, std::move(schema_path_p), max_repeat, max_define,
std::move(child_writer_p), can_have_nulls) {
}
~ArrayColumnWriter() override = default;

public:
void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
};

void ArrayColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto &array_child = ArrayVector::GetEntry(vector);
auto array_size = ArrayType::GetSize(vector.GetType());
child_writer->Analyze(*state.child_state, &state_p, array_child, array_size * count);
}

void ArrayColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();

auto array_size = ArrayType::GetSize(vector.GetType());
auto &validity = FlatVector::Validity(vector);

// write definition levels and repeats
// the main difference between this and ListColumnWriter::Prepare is that we need to make sure to write out
// repetition levels and definitions for the child elements of the array even if the array itself is NULL.
idx_t start = 0;
idx_t vcount = parent ? parent->definition_levels.size() - state.parent_index : count;
idx_t vector_index = 0;
for (idx_t i = start; i < vcount; i++) {
idx_t parent_index = state.parent_index + i;
if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index]) {
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.repetition_levels.push_back(parent->repetition_levels[parent_index]);
state.is_empty.push_back(true);
continue;
}
auto first_repeat_level =
parent && !parent->repetition_levels.empty() ? parent->repetition_levels[parent_index] : max_repeat;
if (parent && parent->definition_levels[parent_index] != PARQUET_DEFINE_VALID) {
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(false);
for (idx_t k = 1; k < array_size; k++) {
state.repetition_levels.push_back(max_repeat + 1);
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.is_empty.push_back(false);
}
} else if (validity.RowIsValid(vector_index)) {
// push the repetition levels
state.definition_levels.push_back(PARQUET_DEFINE_VALID);
state.is_empty.push_back(false);

state.repetition_levels.push_back(first_repeat_level);
for (idx_t k = 1; k < array_size; k++) {
state.repetition_levels.push_back(max_repeat + 1);
state.definition_levels.push_back(PARQUET_DEFINE_VALID);
state.is_empty.push_back(false);
}
} else {
state.definition_levels.push_back(max_define - 1);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(false);
for (idx_t k = 1; k < array_size; k++) {
state.repetition_levels.push_back(max_repeat + 1);
state.definition_levels.push_back(max_define - 1);
state.is_empty.push_back(false);
}
}
vector_index++;
}
state.parent_index += vcount;

auto &array_child = ArrayVector::GetEntry(vector);
child_writer->Prepare(*state.child_state, &state_p, array_child, count * array_size);
}

void ArrayColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto array_size = ArrayType::GetSize(vector.GetType());
auto &array_child = ArrayVector::GetEntry(vector);
child_writer->Write(*state.child_state, array_child, count * array_size);
}

//===--------------------------------------------------------------------===//
// Create Column Writer
//===--------------------------------------------------------------------===//

unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parquet::format::SchemaElement> &schemas,
ParquetWriter &writer, const LogicalType &type,
const string &name, vector<string> schema_path,
@@ -1879,8 +1970,9 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
return make_uniq<StructColumnWriter>(writer, schema_idx, std::move(schema_path), max_repeat, max_define,
std::move(child_writers), can_have_nulls);
}
- if (type.id() == LogicalTypeId::LIST) {
- 	auto &child_type = ListType::GetChildType(type);
+ if (type.id() == LogicalTypeId::LIST || type.id() == LogicalTypeId::ARRAY) {
+ 	auto is_list = type.id() == LogicalTypeId::LIST;
+ 	auto &child_type = is_list ? ListType::GetChildType(type) : ArrayType::GetChildType(type);
// set up the two schema elements for the list
// for some reason we only set the converted type in the OPTIONAL element
// first an OPTIONAL element
@@ -1907,14 +1999,19 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
repeated_element.__isset.num_children = true;
repeated_element.__isset.type = false;
repeated_element.__isset.repetition_type = true;
repeated_element.name = "list";
repeated_element.name = is_list ? "list" : "array";
schemas.push_back(std::move(repeated_element));
schema_path.emplace_back("list");
schema_path.emplace_back(is_list ? "list" : "array");

auto child_writer = CreateWriterRecursive(schemas, writer, child_type, "element", schema_path, child_field_ids,
max_repeat + 1, max_define + 2);
- return make_uniq<ListColumnWriter>(writer, schema_idx, std::move(schema_path), max_repeat, max_define,
-                                    std::move(child_writer), can_have_nulls);
+ if (is_list) {
+ 	return make_uniq<ListColumnWriter>(writer, schema_idx, std::move(schema_path), max_repeat, max_define,
+ 	                                   std::move(child_writer), can_have_nulls);
+ } else {
+ 	return make_uniq<ArrayColumnWriter>(writer, schema_idx, std::move(schema_path), max_repeat, max_define,
+ 	                                    std::move(child_writer), can_have_nulls);
+ }
}
if (type.id() == LogicalTypeId::MAP) {
// map type
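
The new ArrayColumnWriter above is the heart of the change. Parquet encodes nesting as pairs of repetition and definition levels, and the comment in Prepare names the key difference from lists: because the array size is fixed, every row contributes exactly array_size level pairs, even when the array itself is NULL, so the child writer always consumes count * array_size values. The following is a deliberately simplified sketch of that layout for a top-level ARRAY column; parent nesting, empty-parent handling, and the PARQUET_DEFINE_VALID sentinel from the real writer are all omitted.

// Simplified sketch, not the DuckDB writer: level layout for a top-level
// fixed-size array column.
#include <cstdint>
#include <vector>

struct Levels {
    std::vector<uint16_t> repetition;
    std::vector<uint16_t> definition;
};

Levels ArrayLevels(const std::vector<bool> &row_valid, uint16_t array_size,
                   uint16_t max_repeat, uint16_t max_define) {
    Levels out;
    for (bool valid : row_valid) {
        for (uint16_t k = 0; k < array_size; k++) {
            // element 0 opens a new row; later elements repeat at the
            // innermost (array) level
            out.repetition.push_back(k == 0 ? max_repeat : max_repeat + 1);
            // a NULL array stops one definition level short of valid,
            // but still emits an entry per element
            out.definition.push_back(valid ? max_define : max_define - 1);
        }
    }
    return out;
}

On the schema side, CreateWriterRecursive reuses the list layout unchanged and only swaps the repeated group's name from "list" to "array".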
4 changes: 4 additions & 0 deletions extension/parquet/parquet_writer.cpp
@@ -170,6 +170,10 @@ CopyTypeSupport ParquetWriter::TypeIsSupported(const LogicalType &type) {
auto &child_type = ListType::GetChildType(type);
return TypeIsSupported(child_type);
}
+ if (id == LogicalTypeId::ARRAY) {
+ 	auto &child_type = ArrayType::GetChildType(type);
+ 	return TypeIsSupported(child_type);
+ }
if (id == LogicalTypeId::UNION) {
auto count = UnionType::GetMemberCount(type);
for (idx_t i = 0; i < count; i++) {
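
The support check recurses the same way the writer does: a nested type is writable only if its children are. A toy predicate showing that shape; the real function returns a CopyTypeSupport enum rather than a bool, and the T type here is invented for the example.

// Toy sketch of the recursive support check.
#include <vector>

enum class Id { INTEGER, VARCHAR, LIST, ARRAY, UNION, UNSUPPORTED };

struct T {
    Id id;
    std::vector<T> children; // element type, or union member types
};

bool TypeIsSupported(const T &t) {
    switch (t.id) {
    case Id::LIST:
    case Id::ARRAY:
        // single child: supported iff the element type is supported
        return TypeIsSupported(t.children[0]);
    case Id::UNION:
        // supported only if every member type is supported
        for (const auto &member : t.children) {
            if (!TypeIsSupported(member)) {
                return false;
            }
        }
        return true;
    case Id::UNSUPPORTED:
        return false;
    default:
        return true;
    }
}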
1 change: 1 addition & 0 deletions scripts/generate_serialization.py
@@ -103,6 +103,7 @@
'LogicalType',
'ColumnDefinition',
'BaseStatistics',
+ 'BoundLimitNode',
]

reference_list = ['ClientContext', 'bound_parameter_map_t']
1 change: 1 addition & 0 deletions src/catalog/catalog_entry/table_catalog_entry.cpp
@@ -217,6 +217,7 @@ static void BindExtraColumns(TableCatalogEntry &table, LogicalGet &get, LogicalP
static bool TypeSupportsRegularUpdate(const LogicalType &type) {
switch (type.id()) {
case LogicalTypeId::LIST:
+ case LogicalTypeId::ARRAY:
case LogicalTypeId::MAP:
case LogicalTypeId::UNION:
// lists and maps and unions don't support updates directly
2 changes: 1 addition & 1 deletion src/common/arrow/appender/CMakeLists.txt
@@ -1,5 +1,5 @@
add_library_unity(duckdb_common_arrow_appender OBJECT bool_data.cpp
- struct_data.cpp union_data.cpp)
+ struct_data.cpp union_data.cpp fixed_size_list_data.cpp)
set(ALL_OBJECT_FILES
${ALL_OBJECT_FILES} $<TARGET_OBJECTS:duckdb_common_arrow_appender>
PARENT_SCOPE)
