From 1a6510c4751c1761d7c6706e1d9a96fdcb772b47 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 7 Jun 2022 15:40:36 +0200 Subject: [PATCH 01/28] current state of RLE doc --- docs/source/format/Columnar.rst | 79 +++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 5f9537384c000..dba2b17694470 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -765,6 +765,85 @@ application. We discuss dictionary encoding as it relates to serialization further below. +.. _run-length-encoded-layout: + +Run-Length-encoded Layout +------------------------- + +Run-Length is a data representation that represents data as sequences of the +same a, called runs. Each run is represented as a value, and an integer +describing how often this value is repeated. + +Any array can be run-length-encoded. A run-length encoded array has a single +buffer holding as many 32-bit integers, as there are runs. The actual values are +hold in a child array, which is just a regular array + +The dictionary is stored as an optional +property of an array. When a field is dictionary encoded, the values are +represented by an array of non-negative integers representing the index of the +value in the dictionary. The memory layout for a dictionary-encoded array is +the same as that of a primitive integer layout. The dictionary is handled as a +separate columnar array with its own respective layout. 
+ +As an example, you could have the following data: :: + + type: Float32 + + [1.0, 1.0, 1.0, 1.0, null, 'null', 2.0] + +In Run-length-encoded form, this could appear as: + +:: + + * Length: 3, Null count: 0 + * Accumulated run lengths buffer: + + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 6-63 | + |-------------|-------------|-------------|-----------------------| + | 4 | 6 | 7 | unspecified (padding) | + + * Children arrays: + + * values (Float32): + * Length: 3, Null count: 1 + * Validity bitmap buffer: + + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + |00000101 | 0 (padding) | + + * Values buffer + + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 6-63 | + |-------------|-------------|-------------|-----------------------| + | 1.0 | unspecified | 2.0 | unspecified (padding) | + + +Note that a dictionary is permitted to contain duplicate values or +nulls: + +:: + + data VarBinary (dictionary-encoded) + index_type: Int32 + values: [0, 1, 3, 1, 4, 2] + + dictionary + type: VarBinary + values: ['foo', 'bar', 'baz', 'foo', null] + +The null count of such arrays is dictated only by the validity bitmap +of its indices, irrespective of any null values in the dictionary. + +Since unsigned integers can be more difficult to work with in some cases +(e.g. in the JVM), we recommend preferring signed integers over unsigned +integers for representing dictionary indices. Additionally, we recommend +avoiding using 64-bit unsigned integer indices unless they are required by an +application. + +We discuss dictionary encoding as it relates to serialization further +below. 
+ Buffer Listing for Each Layout ------------------------------ From d05c1ae75f84808dde522cea37d23344e1aee0de Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 7 Jun 2022 19:39:16 +0200 Subject: [PATCH 02/28] formatting --- docs/source/format/Columnar.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index dba2b17694470..dc966216153b2 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -808,15 +808,15 @@ In Run-length-encoded form, this could appear as: * Length: 3, Null count: 1 * Validity bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - |00000101 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00000101 | 0 (padding) | - * Values buffer + * Values buffer - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 6-63 | - |-------------|-------------|-------------|-----------------------| - | 1.0 | unspecified | 2.0 | unspecified (padding) | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-63 | + |-------------|-------------|-------------|-----------------------| + | 1.0 | unspecified | 2.0 | unspecified (padding) | Note that a dictionary is permitted to contain duplicate values or From 2a458b95826aa38a8264803cb4b9348888e06671 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 7 Jun 2022 19:42:30 +0200 Subject: [PATCH 03/28] minor fixes --- docs/source/format/Columnar.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index dc966216153b2..503cf0f598a2c 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -789,13 +789,13 @@ As an example, you could have the following data: :: type: Float32 - [1.0, 1.0, 1.0, 1.0, null, 'null', 2.0] + [1.0, 1.0, 1.0, 1.0, null, null, 2.0] In 
Run-length-encoded form, this could appear as: :: - * Length: 3, Null count: 0 + * Length: 3, Null count: 2 * Accumulated run lengths buffer: | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 6-63 | From d07cda19b991e7bb96fc20d3b403809f7843d39c Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 16 Jun 2022 15:35:08 +0200 Subject: [PATCH 04/28] replace copy-paste mistake with actual rle description --- docs/source/format/Columnar.rst | 50 ++++++++++----------------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 503cf0f598a2c..caa5150fc7225 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -775,20 +775,25 @@ same a, called runs. Each run is represented as a value, and an integer describing how often this value is repeated. Any array can be run-length-encoded. A run-length encoded array has a single -buffer holding as many 32-bit integers, as there are runs. The actual values are -hold in a child array, which is just a regular array +buffer holding as many 32-bit integers, as there are runs. The actual values +are hold in a child array, which is just a regular array. -The dictionary is stored as an optional -property of an array. When a field is dictionary encoded, the values are -represented by an array of non-negative integers representing the index of the -value in the dictionary. The memory layout for a dictionary-encoded array is -the same as that of a primitive integer layout. The dictionary is handled as a -separate columnar array with its own respective layout. +The values in the parent array buffer represent the length of each run. They do +not hold the length of the respective run directly, but the accumulated length +of all runs from the first to the current one. This allows relatively efficient +random access from a logical index using binary search. 
The length of an +individual run can be determined by subtracting two adjacent values. + +A run has to have a length of at least 1. This means the values in the +accumulated run lengths buffer are all positive and in strictly ascending +order. + +An accumulated run length cannot be null, therefore the parent array has no +validity buffer. As an example, you could have the following data: :: type: Float32 - [1.0, 1.0, 1.0, 1.0, null, null, 2.0] In Run-length-encoded form, this could appear as: @@ -819,31 +824,6 @@ In Run-length-encoded form, this could appear as: | 1.0 | unspecified | 2.0 | unspecified (padding) | -Note that a dictionary is permitted to contain duplicate values or -nulls: - -:: - - data VarBinary (dictionary-encoded) - index_type: Int32 - values: [0, 1, 3, 1, 4, 2] - - dictionary - type: VarBinary - values: ['foo', 'bar', 'baz', 'foo', null] - -The null count of such arrays is dictated only by the validity bitmap -of its indices, irrespective of any null values in the dictionary. - -Since unsigned integers can be more difficult to work with in some cases -(e.g. in the JVM), we recommend preferring signed integers over unsigned -integers for representing dictionary indices. Additionally, we recommend -avoiding using 64-bit unsigned integer indices unless they are required by an -application. - -We discuss dictionary encoding as it relates to serialization further -below. - Buffer Listing for Each Layout ------------------------------ @@ -1036,7 +1016,7 @@ The ``Buffer`` Flatbuffers value describes the location and size of a piece of memory. Generally these are interpreted relative to the **encapsulated message format** defined below. -The ``size`` field of ``Buffer`` is not required to account for padding +The ``size`` field of ``Buffer`` is not required to account for paddingeng-career-mgmt bytes. 
Since this metadata can be used to communicate in-memory pointer addresses between libraries, it is recommended to set ``size`` to the actual memory size rather than the padded size. From 29ff9304f7f5f9ba5e4c5d3df6cc9e550642e024 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 16 Jun 2022 23:40:52 +0200 Subject: [PATCH 05/28] small fixes from PR comments --- docs/source/format/Columnar.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index caa5150fc7225..9009fa549c796 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -771,12 +771,12 @@ Run-Length-encoded Layout ------------------------- Run-Length is a data representation that represents data as sequences of the -same a, called runs. Each run is represented as a value, and an integer +same value, called runs. Each run is represented as a value, and an integer describing how often this value is repeated. Any array can be run-length-encoded. A run-length encoded array has a single -buffer holding as many 32-bit integers, as there are runs. The actual values -are hold in a child array, which is just a regular array. +buffer holding as many signed 32-bit integers, as there are runs. The actual +values are hold in a child array, which is just a regular array. The values in the parent array buffer represent the length of each run. 
They do not hold the length of the respective run directly, but the accumulated length From 095e7421a30f7fecb9a81fe9c65b437cb2209500 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 16 Jun 2022 23:41:41 +0200 Subject: [PATCH 06/28] hold -> held --- docs/source/format/Columnar.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 9009fa549c796..f943dcd53aae8 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -776,7 +776,7 @@ describing how often this value is repeated. Any array can be run-length-encoded. A run-length encoded array has a single buffer holding as many signed 32-bit integers, as there are runs. The actual -values are hold in a child array, which is just a regular array. +values are held in a child array, which is just a regular array. The values in the parent array buffer represent the length of each run. They do not hold the length of the respective run directly, but the accumulated length From 386a23b449faebcb08923fc31d17cd72e214bf50 Mon Sep 17 00:00:00 2001 From: zagto Date: Mon, 27 Jun 2022 17:40:35 +0200 Subject: [PATCH 07/28] Apply suggestions from code review Co-authored-by: Weston Pace --- docs/source/format/Columnar.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index f943dcd53aae8..189bbf24a9ec3 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -767,15 +767,15 @@ below. .. _run-length-encoded-layout: -Run-Length-encoded Layout +Run-Length Encoded Layout ------------------------- Run-Length is a data representation that represents data as sequences of the same value, called runs. Each run is represented as a value, and an integer describing how often this value is repeated. -Any array can be run-length-encoded. 
A run-length encoded array has a single -buffer holding as many signed 32-bit integers, as there are runs. The actual +Any array can be run-length encoded. A run-length encoded array has a single +buffer holding a signed 32-bit integer for each run. The actual values are held in a child array, which is just a regular array. The values in the parent array buffer represent the length of each run. They do @@ -1016,7 +1016,7 @@ The ``Buffer`` Flatbuffers value describes the location and size of a piece of memory. Generally these are interpreted relative to the **encapsulated message format** defined below. -The ``size`` field of ``Buffer`` is not required to account for paddingeng-career-mgmt +The ``size`` field of ``Buffer`` is not required to account for padding bytes. Since this metadata can be used to communicate in-memory pointer addresses between libraries, it is recommended to set ``size`` to the actual memory size rather than the padded size. From fb108833902b14fce79ef6a37f5a8df82f35fca8 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 27 Jun 2022 17:50:22 +0200 Subject: [PATCH 08/28] make rle parent length the logical length (code already works like this) --- docs/source/format/Columnar.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 189bbf24a9ec3..1eb1de5a2d417 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -800,7 +800,7 @@ In Run-length-encoded form, this could appear as: :: - * Length: 3, Null count: 2 + * Length: 7, Null count: 2 * Accumulated run lengths buffer: | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 6-63 | From 5225b2a014978206a36acf8f2a0398a1cb3f2201 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 25 Aug 2022 18:21:35 +0200 Subject: [PATCH 09/28] update columnar format doc --- docs/source/format/Columnar.rst | 38 +++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff 
--git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 1eb1de5a2d417..d2e8ef071e0dc 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -774,22 +774,20 @@ Run-Length is a data representation that represents data as sequences of the same value, called runs. Each run is represented as a value, and an integer describing how often this value is repeated. -Any array can be run-length encoded. A run-length encoded array has a single -buffer holding a signed 32-bit integer for each run. The actual -values are held in a child array, which is just a regular array. +Any array can be run-length encoded. A run-length encoded array has no buffers +by itself, but has two child arrays. The first one holds a signed 32-bit integer +for each run. The actual values of each run are held the second child array. -The values in the parent array buffer represent the length of each run. They do +The values in the first child array represent the length of each run. They do not hold the length of the respective run directly, but the accumulated length -of all runs from the first to the current one. This allows relatively efficient -random access from a logical index using binary search. The length of an -individual run can be determined by subtracting two adjacent values. +of all runs from the first to the current one, i.e. the logical index where the +current run ends. This allows relatively efficient random access from a logical +index using binary search. The length of an individual run can be determined by +subtracting two adjacent values. A run has to have a length of at least 1. This means the values in the -accumulated run lengths buffer are all positive and in strictly ascending -order. - -An accumulated run length cannot be null, therefore the parent array has no -validity buffer. +run ends array all positive and in strictly ascending order. A run end cannot be +null. 
As an example, you could have the following data: :: @@ -801,15 +799,18 @@ In Run-length-encoded form, this could appear as: :: * Length: 7, Null count: 2 - * Accumulated run lengths buffer: + * Children arrays: - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 6-63 | - |-------------|-------------|-------------|-----------------------| - | 4 | 6 | 7 | unspecified (padding) | + * run ends (Int32): + * Length: 3, Null count: 0 + * Validity bitmap buffer: Not required + * Values buffer - * Children arrays: + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-63 | + |-------------|-------------|-------------|-----------------------| + | 4 | 6 | 7 | unspecified (padding) | - * values (Float32): + * values (Float32): * Length: 3, Null count: 1 * Validity bitmap buffer: @@ -843,6 +844,7 @@ of memory buffers for each layout. "Dense Union",type ids,offsets, "Null",,, "Dictionary-encoded",validity,data (indices), + "Run-length encoded",,, Logical Types ============= From bae0488edb25ee8d108595f63fa575ed65174d36 Mon Sep 17 00:00:00 2001 From: zagto Date: Tue, 20 Sep 2022 18:48:22 +0200 Subject: [PATCH 10/28] Update docs/source/format/Columnar.rst Co-authored-by: Andrew Lamb --- docs/source/format/Columnar.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index d2e8ef071e0dc..6a0984e159bbe 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -775,7 +775,7 @@ same value, called runs. Each run is represented as a value, and an integer describing how often this value is repeated. Any array can be run-length encoded. A run-length encoded array has no buffers -by itself, but has two child arrays. The first one holds a signed 32-bit integer +by itself, but has two child arrays. The first one holds a signed 32-bit integer called a "run end" for each run. The actual values of each run are held the second child array. 
The values in the first child array represent the length of each run. They do From 89d1ea19ebed1b89e16be2abbad61786dcae6b68 Mon Sep 17 00:00:00 2001 From: zagto Date: Tue, 20 Sep 2022 18:48:55 +0200 Subject: [PATCH 11/28] Update docs/source/format/Columnar.rst Co-authored-by: Andrew Lamb --- docs/source/format/Columnar.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 6a0984e159bbe..a7376c753abb5 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -785,7 +785,7 @@ current run ends. This allows relatively efficient random access from a logical index using binary search. The length of an individual run can be determined by subtracting two adjacent values. -A run has to have a length of at least 1. This means the values in the +A run must have a length of at least 1. This means the values in the run ends array all positive and in strictly ascending order. A run end cannot be null. From abdb295fbafeee796b3444d258189b45300e84c4 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 29 Nov 2022 19:19:45 +0100 Subject: [PATCH 12/28] Columnar doc: mention different bit-widths --- docs/source/format/Columnar.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index a7376c753abb5..c080111d462b2 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -775,8 +775,10 @@ same value, called runs. Each run is represented as a value, and an integer describing how often this value is repeated. Any array can be run-length encoded. A run-length encoded array has no buffers -by itself, but has two child arrays. The first one holds a signed 32-bit integer called a "run end" -for each run. The actual values of each run are held the second child array. +by itself, but has two child arrays. 
The first one holds a signed integer +called a "run end" for each run. The run ends array can hold either 16, 32, or +64-bit integers. The actual values of each run are held +in the second child array. The values in the first child array represent the length of each run. They do not hold the length of the respective run directly, but the accumulated length From a6f991e98f197c5387e9daa7e2e45f2ae4efb65d Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 20 Sep 2022 18:04:44 +0200 Subject: [PATCH 13/28] add rle type to Schema.fbs --- format/Schema.fbs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/format/Schema.fbs b/format/Schema.fbs index 7ee827b5de8da..e1af41be5a7eb 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -178,6 +178,9 @@ table FixedSizeBinary { table Bool { } +table RunLengthEncoded { +} + /// Exact decimal value represented as an integer value in two's /// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers /// are used. The representation uses the endianness indicated @@ -417,6 +420,7 @@ union Type { LargeBinary, LargeUtf8, LargeList, + RunLengthEncoded, } /// ---------------------------------------------------------------------- From 5d7d7087b060be0ea844dd31d17d124b0c7a569e Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 20 Sep 2022 18:06:10 +0200 Subject: [PATCH 14/28] re-generate generated C++ files from flatbuffers --- cpp/src/generated/File_generated.h | 10 +- cpp/src/generated/Message_generated.h | 12 +- cpp/src/generated/Schema_generated.h | 90 ++++-- cpp/src/generated/SparseTensor_generated.h | 11 +- cpp/src/generated/Tensor_generated.h | 9 +- cpp/src/generated/feather_generated.h | 7 - cpp/src/plasma/common_generated.h | 27 +- cpp/src/plasma/plasma_generated.h | 343 +++++++-------------- 8 files changed, 202 insertions(+), 307 deletions(-) diff --git a/cpp/src/generated/File_generated.h b/cpp/src/generated/File_generated.h index 06953c4a04044..5b219f1eb0ed1 100644 --- 
a/cpp/src/generated/File_generated.h +++ b/cpp/src/generated/File_generated.h @@ -26,15 +26,18 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Block FLATBUFFERS_FINAL_CLASS { int64_t bodyLength_; public: - Block() { - memset(static_cast(this), 0, sizeof(Block)); + Block() + : offset_(0), + metaDataLength_(0), + padding0__(0), + bodyLength_(0) { + (void)padding0__; } Block(int64_t _offset, int32_t _metaDataLength, int64_t _bodyLength) : offset_(flatbuffers::EndianScalar(_offset)), metaDataLength_(flatbuffers::EndianScalar(_metaDataLength)), padding0__(0), bodyLength_(flatbuffers::EndianScalar(_bodyLength)) { - (void)padding0__; } /// Index to the start of the RecordBlock (note this is past the Message header) int64_t offset() const { @@ -119,7 +122,6 @@ struct FooterBuilder { : fbb_(_fbb) { start_ = fbb_.StartTable(); } - FooterBuilder &operator=(const FooterBuilder &); flatbuffers::Offset