
Commit

Merge branch 'main' into materialized_insert
kryonix authored Feb 29, 2024
2 parents fd092a4 + 68e4f93 commit 1ae5793
Showing 106 changed files with 1,664 additions and 962 deletions.
1 change: 1 addition & 0 deletions .github/config/out_of_tree_extensions.cmake
@@ -90,5 +90,6 @@ if (NOT WIN32)
LOAD_TESTS DONT_LINK
GIT_URL https://github.com/duckdb/substrait
GIT_TAG 870bab8725d1123905296bfb1f35ce737434e0b3
+ APPLY_PATCHES
)
endif()
38 changes: 38 additions & 0 deletions .github/patches/extensions/substrait/substrait.patch
@@ -0,0 +1,38 @@
diff --git a/src/to_substrait.cpp b/src/to_substrait.cpp
index 03d9778..d2429c6 100644
--- a/src/to_substrait.cpp
+++ b/src/to_substrait.cpp
@@ -777,8 +777,31 @@ substrait::Rel *DuckDBToSubstrait::TransformLimit(LogicalOperator &dop) {
auto stopn = res->mutable_fetch();
stopn->set_allocated_input(TransformOp(*dop.children[0]));

- stopn->set_offset(dlimit.offset_val);
- stopn->set_count(dlimit.limit_val);
+ idx_t limit_val;
+ idx_t offset_val;
+
+ switch(dlimit.limit_val.Type()) {
+ case LimitNodeType::CONSTANT_VALUE:
+ limit_val = dlimit.limit_val.GetConstantValue();
+ break;
+ case LimitNodeType::UNSET:
+ limit_val = 2ULL << 62ULL;
+ break;
+ default:
+ throw InternalException("Unsupported limit value type");
+ }
+ switch(dlimit.offset_val.Type()) {
+ case LimitNodeType::CONSTANT_VALUE:
+ offset_val = dlimit.offset_val.GetConstantValue();
+ break;
+ case LimitNodeType::UNSET:
+ offset_val = 0;
+ break;
+ default:
+ throw InternalException("Unsupported offset value type");
+ }
+ stopn->set_offset(offset_val);
+ stopn->set_count(limit_val);
return res;
}
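
The patch adapts the extension to DuckDB's reworked limit representation, where a limit or offset is a node that can hold a constant, be unset, or carry an expression. As a standalone illustration, here is a minimal sketch of the same resolution pattern; LimitNode and Resolve are hypothetical stand-ins for this example, not the extension's API.

// Sketch only: hypothetical types mirroring the switch logic in the patch.
#include <cstdint>
#include <stdexcept>

enum class LimitNodeType { CONSTANT_VALUE, UNSET, EXPRESSION_VALUE };

struct LimitNode {
    LimitNodeType type = LimitNodeType::UNSET;
    uint64_t constant = 0;
    LimitNodeType Type() const { return type; }
    uint64_t GetConstantValue() const { return constant; }
};

// Constants pass through, unset falls back to a caller-supplied default,
// and anything else (e.g. expression-valued limits) is rejected.
uint64_t Resolve(const LimitNode &node, uint64_t unset_default) {
    switch (node.Type()) {
    case LimitNodeType::CONSTANT_VALUE:
        return node.GetConstantValue();
    case LimitNodeType::UNSET:
        return unset_default;
    default:
        throw std::runtime_error("unsupported limit value type");
    }
}

int main() {
    LimitNode limit;  // unset: default to 2^63, the patch's "no limit" sentinel
    LimitNode offset; // unset: default to row 0
    return Resolve(limit, 2ULL << 62ULL) > Resolve(offset, 0) ? 0 : 1;
}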

1 change: 0 additions & 1 deletion .github/workflows/Julia.yml
@@ -69,7 +69,6 @@ jobs:
- '1.6'
- '1.7'
- '1'
- - 'nightly'
os:
- ubuntu-latest
arch:
2 changes: 1 addition & 1 deletion .github/workflows/LinuxRelease.yml
@@ -107,7 +107,7 @@ jobs:
zip -j duckdb_cli-linux-amd64.zip build/release/duckdb
zip -j libduckdb-linux-amd64.zip build/release/src/libduckdb*.* src/amalgamation/duckdb.hpp src/include/duckdb.h
zip -j libduckdb-src.zip src/amalgamation/duckdb.hpp src/amalgamation/duckdb.cpp src/include/duckdb.h
- zip -j duckdb_odbc-linux-amd64.zip build/release/tools/odbc/libduckdb_odbc.so tools/odbc/linux_setup/unixodbc_setup.sh
+ zip -j duckdb_odbc-linux-amd64.zip build/release/tools/odbc/libduckdb_odbc.so tools/odbc/linux_setup/unixodbc_setup.sh tools/odbc/linux_setup/update_odbc_path.py
python3 scripts/asset-upload-gha.py libduckdb-src.zip libduckdb-linux-amd64.zip duckdb_cli-linux-amd64.zip duckdb_odbc-linux-amd64.zip
- uses: actions/upload-artifact@v3
47 changes: 30 additions & 17 deletions .github/workflows/Wasm.yml
@@ -2,10 +2,19 @@ name: DuckDB-Wasm extensions
on:
workflow_dispatch:
inputs:
# Git ref of the duckdb-wasm repo
duckdb_wasm_ref:
required: true
type: string
# Git ref of the duckdb repo
duckdb_ref:
required: true
type: string
# Platforms to build, as a JSON array
platforms:
required: false
default: '["wasm_mvp", "wasm_eh", "wasm_threads"]'
type: string
# Publish extensions on extensions.duckdb.org?
release_s3:
required: true
@@ -21,16 +30,18 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- duckdb_wasm_arch: [ 'mvp', 'eh', 'threads' ]
+ duckdb_wasm_arch: ${{ fromJSON(github.event.inputs.platforms) }}
env:
VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
DUCKDB_PLATFORM: "wasm_${{ matrix.duckdb_wasm_arch }}"
DUCKDB_PLATFORM: "${{ matrix.duckdb_wasm_arch }}"

steps:
- uses: actions/checkout@v3
with:
- ref: ${{ inputs.duckdb_ref }}
+ ref: ${{ inputs.duckdb_wasm_ref }}
fetch-depth: 0
submodules: true
repository: duckdb/duckdb-wasm

- uses: mymindstorm/setup-emsdk@v12
with:
@@ -50,32 +61,34 @@ jobs:
with:
key: ${{ github.job }}-${{ matrix.duckdb_wasm_arch }}

- name: PatchDuckDB
run: |
cd submodules/duckdb
git checkout ${{ github.event.inputs.duckdb_ref }}
git apply ../../duckdb.patch
cd ../..
cp .github/config/extension_config_wasm.cmake submodules/duckdb/extension/extension_config.cmake
- name: Build Wasm module MVP
- if: ${{ matrix.duckdb_wasm_arch == 'mvp' }}
+ if: ${{ matrix.duckdb_wasm_arch == 'wasm_mvp' }}
run: |
mkdir -p ./build/wasm_mvp
emcmake cmake -G "Ninja" -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_mvp -DCMAKE_CXX_FLAGS="-DDUCKDB_CUSTOM_PLATFORM=wasm_mvp" -DDUCKDB_EXPLICIT_PLATFORM=wasm_mvp -DLOCAL_EXTENSION_REPO='build/to_be_deployed' -DDUCKDB_EXTENSION_CONFIGS=".github/config/in_tree_extensions.cmake" -DSKIP_EXTENSIONS="httpfs"
emmake ninja -j8 -Cbuild/wasm_mvp
DUCKDB_PLATFORM=wasm_mvp DUCKDB_WASM_LOADABLE_EXTENSIONS="signed" GEN=ninja ./scripts/wasm_build_lib.sh relsize mvp
- name: Build Wasm module EH
- if: ${{ matrix.duckdb_wasm_arch == 'eh' }}
+ if: ${{ matrix.duckdb_wasm_arch == 'wasm_eh' }}
run: |
mkdir -p ./build/wasm_eh
emcmake cmake -G "Ninja" -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_eh -DCMAKE_CXX_FLAGS="-fwasm-exceptions -DWEBDB_FAST_EXCEPTIONS=1 -DDUCKDB_CUSTOM_PLATFORM=wasm_eh" -DDUCKDB_EXPLICIT_PLATFORM=wasm_eh -DLOCAL_EXTENSION_REPO='build/to_be_deployed' -DDUCKDB_EXTENSION_CONFIGS=".github/config/in_tree_extensions.cmake" -DSKIP_EXTENSIONS="httpfs"
emmake ninja -j8 -Cbuild/wasm_eh
DUCKDB_PLATFORM=wasm_eh DUCKDB_WASM_LOADABLE_EXTENSIONS="signed" GEN=ninja ./scripts/wasm_build_lib.sh relsize eh
- name: Build Wasm module THREADS
- if: ${{ matrix.duckdb_wasm_arch == 'threads' }}
+ if: ${{ matrix.duckdb_wasm_arch == 'wasm_threads' }}
run: |
mkdir -p ./build/wasm_threads
emcmake cmake -G "Ninja" -DWASM_LOADABLE_EXTENSIONS=1 -DBUILD_EXTENSIONS_ONLY=1 -Bbuild/wasm_threads -DCMAKE_CXX_FLAGS="-fwasm-exceptions -DWEBDB_FAST_EXCEPTIONS=1 -DWITH_WASM_THREADS=1 -DWITH_WASM_SIMD=1 -DWITH_WASM_BULK_MEMORY=1 -DDUCKDB_CUSTOM_PLATFORM=wasm_threads" -DDUCKDB_EXPLICIT_PLATFORM=wasm_threads -DLOCAL_EXTENSION_REPO='build/to_be_deployed' -DDUCKDB_EXTENSION_CONFIGS=".github/config/in_tree_extensions.cmake" -DSKIP_EXTENSIONS="httpfs"
emmake ninja -j8 -Cbuild/wasm_threads
DUCKDB_PLATFORM=wasm_threads DUCKDB_WASM_LOADABLE_EXTENSIONS="signed" GEN=ninja ./scripts/wasm_build_lib.sh relsize coi
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: duckdb_extensions_${{ env.DUCKDB_PLATFORM }}
- path: build/to_be_deployed/${{ inputs.duckdb_ref }}/${{ env.DUCKDB_PLATFORM }}
+ path: build/extension_repository/${{ inputs.duckdb_ref }}/${{ env.DUCKDB_PLATFORM }}
retention-days: 1

publish:
@@ -85,7 +98,7 @@ jobs:
- build_wasm
strategy:
matrix:
- duckdb_arch: [ 'wasm_mvp', 'wasm_eh', 'wasm_threads' ]
+ duckdb_arch: ${{ fromJSON(github.event.inputs.platforms) }}
steps:
- uses: actions/checkout@v3

6 changes: 6 additions & 0 deletions extension/json/json_functions/json_create.cpp
@@ -61,6 +61,9 @@ static LogicalType GetJSONType(StructNames &const_struct_names, const LogicalTyp
// The nested types need to conform as well
case LogicalTypeId::LIST:
return LogicalType::LIST(GetJSONType(const_struct_names, ListType::GetChildType(type)));
+ case LogicalTypeId::ARRAY:
+ 	return LogicalType::ARRAY(GetJSONType(const_struct_names, ArrayType::GetChildType(type)),
+ 	                          ArrayType::GetSize(type));
// Struct and MAP are treated as JSON values
case LogicalTypeId::STRUCT: {
child_list_t<LogicalType> child_types;
@@ -435,6 +438,9 @@ static void CreateValuesList(const StructNames &names, yyjson_mut_doc *doc, yyjs

static void CreateValuesArray(const StructNames &names, yyjson_mut_doc *doc, yyjson_mut_val *vals[], Vector &value_v,
idx_t count) {

+ value_v.Flatten(count);

// Initialize array for the nested values
auto &child_v = ArrayVector::GetEntry(value_v);
auto array_size = ArrayType::GetSize(value_v.GetType());
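
These hunks extend JSON creation to DuckDB's fixed-size ARRAY type: the type-mapping recursion must carry the array's element count through unchanged, and CreateValuesArray flattens the vector up front so child entries can be addressed positionally. A toy sketch of the type-rewriting shape, with made-up Type and TypeId in place of DuckDB's LogicalType API:

// Toy sketch: rewrite leaf types recursively; LIST drops no information,
// while a fixed-size ARRAY must keep its element count.
#include <cstddef>
#include <memory>

enum class TypeId { VARCHAR, JSON, LIST, ARRAY };

struct Type {
    TypeId id;
    std::shared_ptr<Type> child; // element type for LIST/ARRAY, else null
    size_t array_size;           // fixed element count, ARRAY only, else 0
};

Type ToJSONType(const Type &t) {
    switch (t.id) {
    case TypeId::VARCHAR:
        return {TypeId::JSON, nullptr, 0};
    case TypeId::LIST:
        return {TypeId::LIST, std::make_shared<Type>(ToJSONType(*t.child)), 0};
    case TypeId::ARRAY:
        // same recursion, but the size travels with the rewritten type
        return {TypeId::ARRAY, std::make_shared<Type>(ToJSONType(*t.child)), t.array_size};
    default:
        return t;
    }
}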
117 changes: 107 additions & 10 deletions extension/parquet/column_writer.cpp
@@ -663,19 +663,17 @@ void BasicColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {

auto &column_writer = writer.GetWriter();
auto start_offset = column_writer.GetTotalWritten();
- auto page_offset = start_offset;
// flush the dictionary
if (HasDictionary(state)) {
column_chunk.meta_data.statistics.distinct_count = DictionarySize(state);
column_chunk.meta_data.statistics.__isset.distinct_count = true;
- column_chunk.meta_data.dictionary_page_offset = page_offset;
+ column_chunk.meta_data.dictionary_page_offset = start_offset;
column_chunk.meta_data.__isset.dictionary_page_offset = true;
FlushDictionary(state, state.stats_state.get());
- page_offset += state.write_info[0].compressed_size;
}

// record the start position of the pages for this column
- column_chunk.meta_data.data_page_offset = page_offset;
+ column_chunk.meta_data.data_page_offset = column_writer.GetTotalWritten();
SetParquetStatistics(state, column_chunk);

// write the individual pages to disk
@@ -1828,9 +1826,102 @@ void ListColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
child_writer->FinalizeWrite(*state.child_state);
}

//===--------------------------------------------------------------------===//
// Array Column Writer
//===--------------------------------------------------------------------===//
class ArrayColumnWriter : public ListColumnWriter {
public:
ArrayColumnWriter(ParquetWriter &writer, idx_t schema_idx, vector<string> schema_path_p, idx_t max_repeat,
idx_t max_define, unique_ptr<ColumnWriter> child_writer_p, bool can_have_nulls)
: ListColumnWriter(writer, schema_idx, std::move(schema_path_p), max_repeat, max_define,
std::move(child_writer_p), can_have_nulls) {
}
~ArrayColumnWriter() override = default;

public:
void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
};

void ArrayColumnWriter::Analyze(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto &array_child = ArrayVector::GetEntry(vector);
auto array_size = ArrayType::GetSize(vector.GetType());
child_writer->Analyze(*state.child_state, &state_p, array_child, array_size * count);
}

void ArrayColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();

auto array_size = ArrayType::GetSize(vector.GetType());
auto &validity = FlatVector::Validity(vector);

// write definition levels and repeats
// the main difference between this and ListColumnWriter::Prepare is that we need to make sure to write out
// repetition levels and definitions for the child elements of the array even if the array itself is NULL.
idx_t start = 0;
idx_t vcount = parent ? parent->definition_levels.size() - state.parent_index : count;
idx_t vector_index = 0;
for (idx_t i = start; i < vcount; i++) {
idx_t parent_index = state.parent_index + i;
if (parent && !parent->is_empty.empty() && parent->is_empty[parent_index]) {
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.repetition_levels.push_back(parent->repetition_levels[parent_index]);
state.is_empty.push_back(true);
continue;
}
auto first_repeat_level =
parent && !parent->repetition_levels.empty() ? parent->repetition_levels[parent_index] : max_repeat;
if (parent && parent->definition_levels[parent_index] != PARQUET_DEFINE_VALID) {
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(false);
for (idx_t k = 1; k < array_size; k++) {
state.repetition_levels.push_back(max_repeat + 1);
state.definition_levels.push_back(parent->definition_levels[parent_index]);
state.is_empty.push_back(false);
}
} else if (validity.RowIsValid(vector_index)) {
// push the repetition levels
state.definition_levels.push_back(PARQUET_DEFINE_VALID);
state.is_empty.push_back(false);

state.repetition_levels.push_back(first_repeat_level);
for (idx_t k = 1; k < array_size; k++) {
state.repetition_levels.push_back(max_repeat + 1);
state.definition_levels.push_back(PARQUET_DEFINE_VALID);
state.is_empty.push_back(false);
}
} else {
state.definition_levels.push_back(max_define - 1);
state.repetition_levels.push_back(first_repeat_level);
state.is_empty.push_back(false);
for (idx_t k = 1; k < array_size; k++) {
state.repetition_levels.push_back(max_repeat + 1);
state.definition_levels.push_back(max_define - 1);
state.is_empty.push_back(false);
}
}
vector_index++;
}
state.parent_index += vcount;

auto &array_child = ArrayVector::GetEntry(vector);
child_writer->Prepare(*state.child_state, &state_p, array_child, count * array_size);
}

void ArrayColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t count) {
auto &state = state_p.Cast<ListColumnWriterState>();
auto array_size = ArrayType::GetSize(vector.GetType());
auto &array_child = ArrayVector::GetEntry(vector);
child_writer->Write(*state.child_state, array_child, count * array_size);
}

//===--------------------------------------------------------------------===//
// Create Column Writer
//===--------------------------------------------------------------------===//

unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parquet::format::SchemaElement> &schemas,
ParquetWriter &writer, const LogicalType &type,
const string &name, vector<string> schema_path,
@@ -1879,8 +1970,9 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
return make_uniq<StructColumnWriter>(writer, schema_idx, std::move(schema_path), max_repeat, max_define,
std::move(child_writers), can_have_nulls);
}
- if (type.id() == LogicalTypeId::LIST) {
- 	auto &child_type = ListType::GetChildType(type);
+ if (type.id() == LogicalTypeId::LIST || type.id() == LogicalTypeId::ARRAY) {
+ 	auto is_list = type.id() == LogicalTypeId::LIST;
+ 	auto &child_type = is_list ? ListType::GetChildType(type) : ArrayType::GetChildType(type);
// set up the two schema elements for the list
// for some reason we only set the converted type in the OPTIONAL element
// first an OPTIONAL element
@@ -1907,14 +1999,19 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
repeated_element.__isset.num_children = true;
repeated_element.__isset.type = false;
repeated_element.__isset.repetition_type = true;
repeated_element.name = "list";
repeated_element.name = is_list ? "list" : "array";
schemas.push_back(std::move(repeated_element));
schema_path.emplace_back("list");
schema_path.emplace_back(is_list ? "list" : "array");

auto child_writer = CreateWriterRecursive(schemas, writer, child_type, "element", schema_path, child_field_ids,
max_repeat + 1, max_define + 2);
- return make_uniq<ListColumnWriter>(writer, schema_idx, std::move(schema_path), max_repeat, max_define,
-                                    std::move(child_writer), can_have_nulls);
+ if (is_list) {
+ 	return make_uniq<ListColumnWriter>(writer, schema_idx, std::move(schema_path), max_repeat, max_define,
+ 	                                   std::move(child_writer), can_have_nulls);
+ } else {
+ 	return make_uniq<ArrayColumnWriter>(writer, schema_idx, std::move(schema_path), max_repeat, max_define,
+ 	                                    std::move(child_writer), can_have_nulls);
+ }
}
if (type.id() == LogicalTypeId::MAP) {
// map type
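
The new ArrayColumnWriter above is the heart of the change. Parquet encodes nesting as pairs of repetition and definition levels, and the comment in Prepare names the key difference from lists: because the array size is fixed, every row contributes exactly array_size level pairs, even when the array itself is NULL, so the child writer always consumes count * array_size values. The following is a deliberately simplified sketch of that layout for a top-level ARRAY column; parent nesting, empty-parent handling, and the PARQUET_DEFINE_VALID sentinel from the real writer are all omitted.

// Simplified sketch, not the DuckDB writer: level layout for a top-level
// fixed-size array column.
#include <cstdint>
#include <vector>

struct Levels {
    std::vector<uint16_t> repetition;
    std::vector<uint16_t> definition;
};

Levels ArrayLevels(const std::vector<bool> &row_valid, uint16_t array_size,
                   uint16_t max_repeat, uint16_t max_define) {
    Levels out;
    for (bool valid : row_valid) {
        for (uint16_t k = 0; k < array_size; k++) {
            // element 0 opens a new row; later elements repeat at the
            // innermost (array) level
            out.repetition.push_back(k == 0 ? max_repeat : max_repeat + 1);
            // a NULL array stops one definition level short of valid,
            // but still emits an entry per element
            out.definition.push_back(valid ? max_define : max_define - 1);
        }
    }
    return out;
}

On the schema side, CreateWriterRecursive reuses the list layout unchanged and only swaps the repeated group's name from "list" to "array".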
4 changes: 4 additions & 0 deletions extension/parquet/parquet_writer.cpp
@@ -170,6 +170,10 @@ CopyTypeSupport ParquetWriter::TypeIsSupported(const LogicalType &type) {
auto &child_type = ListType::GetChildType(type);
return TypeIsSupported(child_type);
}
+ if (id == LogicalTypeId::ARRAY) {
+ 	auto &child_type = ArrayType::GetChildType(type);
+ 	return TypeIsSupported(child_type);
+ }
if (id == LogicalTypeId::UNION) {
auto count = UnionType::GetMemberCount(type);
for (idx_t i = 0; i < count; i++) {
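
The support check recurses the same way the writer does: a nested type is writable only if its children are. A toy predicate showing that shape; the real function returns a CopyTypeSupport enum rather than a bool, and the T type here is invented for the example.

// Toy sketch of the recursive support check.
#include <vector>

enum class Id { INTEGER, VARCHAR, LIST, ARRAY, UNION, UNSUPPORTED };

struct T {
    Id id;
    std::vector<T> children; // element type, or union member types
};

bool TypeIsSupported(const T &t) {
    switch (t.id) {
    case Id::LIST:
    case Id::ARRAY:
        // single child: supported iff the element type is supported
        return TypeIsSupported(t.children[0]);
    case Id::UNION:
        // supported only if every member type is supported
        for (const auto &member : t.children) {
            if (!TypeIsSupported(member)) {
                return false;
            }
        }
        return true;
    case Id::UNSUPPORTED:
        return false;
    default:
        return true;
    }
}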
1 change: 1 addition & 0 deletions scripts/generate_serialization.py
@@ -103,6 +103,7 @@
'LogicalType',
'ColumnDefinition',
'BaseStatistics',
+ 'BoundLimitNode',
]

reference_list = ['ClientContext', 'bound_parameter_map_t']
1 change: 1 addition & 0 deletions src/catalog/catalog_entry/table_catalog_entry.cpp
@@ -217,6 +217,7 @@ static void BindExtraColumns(TableCatalogEntry &table, LogicalGet &get, LogicalP
static bool TypeSupportsRegularUpdate(const LogicalType &type) {
switch (type.id()) {
case LogicalTypeId::LIST:
+ case LogicalTypeId::ARRAY:
case LogicalTypeId::MAP:
case LogicalTypeId::UNION:
// lists and maps and unions don't support updates directly
2 changes: 1 addition & 1 deletion src/common/arrow/appender/CMakeLists.txt
@@ -1,5 +1,5 @@
add_library_unity(duckdb_common_arrow_appender OBJECT bool_data.cpp
- struct_data.cpp union_data.cpp)
+ struct_data.cpp union_data.cpp fixed_size_list_data.cpp)
set(ALL_OBJECT_FILES
${ALL_OBJECT_FILES} $<TARGET_OBJECTS:duckdb_common_arrow_appender>
PARENT_SCOPE)
