Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vector: Fix ColumnArray does not work well with CHBlockChunkCodec #9477

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 31 additions & 8 deletions dbms/src/DataTypes/DataTypeArray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include <Columns/ColumnArray.h>
#include <Common/Exception.h>
#include <Common/typeid_cast.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
Expand Down Expand Up @@ -111,13 +112,12 @@ void serializeArraySizesPositionIndependent(const IColumn & column, WriteBuffer
{
const ColumnArray & column_array = typeid_cast<const ColumnArray &>(column);
const ColumnArray::Offsets & offset_values = column_array.getOffsets();
size_t size = offset_values.size();

if (!size)
size_t size = offset_values.size();
if (size == 0)
return;

size_t end = limit && (offset + limit < size) ? offset + limit : size;

ColumnArray::Offset prev_offset = offset == 0 ? 0 : offset_values[offset - 1];
for (size_t i = offset; i < end; ++i)
{
Expand Down Expand Up @@ -173,6 +173,12 @@ void DataTypeArray::serializeBinaryBulkWithMultipleStreams(
path.push_back(Substream::ArraySizes);
if (auto * stream = getter(path))
{
// `position_independent_encoding == false` indicates that the `column_array.offsets`
// is serialized as is, which can provide better performance but only supports
// deserialization into an empty column. Conversely, when `position_independent_encoding == true`,
// the `column_array.offsets` is encoded into a format that supports deserializing
// and appending data into a column containing existing data.
// If you are unsure, set position_independent_encoding to true.
if (position_independent_encoding)
serializeArraySizesPositionIndependent(column, *stream, offset, limit);
else
Expand Down Expand Up @@ -224,11 +230,23 @@ void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
path.push_back(Substream::ArraySizes);
if (auto * stream = getter(path))
{
// `position_independent_encoding == false` indicates that the `column_array.offsets`
// is serialized as is, which can provide better performance but only supports
// deserialization into an empty column. Conversely, when `position_independent_encoding == true`,
// the `column_array.offsets` is encoded into a format that supports deserializing
// and appending data into a column containing existing data.
// If you are unsure, set position_independent_encoding to true.
if (position_independent_encoding)
deserializeArraySizesPositionIndependent(column, *stream, limit);
else
{
RUNTIME_CHECK_MSG(
column_array.getOffsetsColumn().empty(),
"try to deserialize Array type to non-empty column without position idenpendent encoding, type_name={}",
getName());
DataTypeNumber<ColumnArray::Offset>()
.deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0);
}
}

path.back() = Substream::ArrayElements;
Expand All @@ -237,9 +255,13 @@ void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
IColumn & nested_column = column_array.getData();

/// Number of values corresponding with `offset_values` must be read.
size_t last_offset = (offset_values.empty() ? 0 : offset_values.back());
const size_t last_offset = (offset_values.empty() ? 0 : offset_values.back());
if (last_offset < nested_column.size())
throw Exception("Nested column is longer than last offset", ErrorCodes::LOGICAL_ERROR);
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Nested column is longer than last offset, last_offset={} nest_column_size={}",
last_offset,
nested_column.size());
size_t nested_limit = last_offset - nested_column.size();
nested->deserializeBinaryBulkWithMultipleStreams(
nested_column,
Expand All @@ -253,9 +275,10 @@ void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
/// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER.
if (!nested_column.empty() && nested_column.size() != last_offset)
throw Exception(
"Cannot read all array values: read just " + toString(nested_column.size()) + " of "
+ toString(last_offset),
ErrorCodes::CANNOT_READ_ALL_DATA);
ErrorCodes::CANNOT_READ_ALL_DATA,
"Cannot read all array values: read just {} of {}",
nested_column.size(),
last_offset);
}


Expand Down
62 changes: 33 additions & 29 deletions dbms/src/DataTypes/IDataType.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class IDataType : private boost::noncopyable
/// static constexpr bool is_parametric = false;

/// Name of data type (examples: UInt64, Array(String)).
virtual String getName() const { return getFamilyName(); };
virtual String getName() const { return getFamilyName(); }

virtual TypeIndex getTypeId() const = 0;

Expand Down Expand Up @@ -124,6 +124,8 @@ class IDataType : private boost::noncopyable
* offset must be not greater than size of column.
* offset + limit could be greater than size of column
* - in that case, column is serialized till the end.
* `position_independent_encoding` - provide better performance when it is false, but it requires not to be
* deserialized the data into a column with existing data.
*/
virtual void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
Expand All @@ -149,7 +151,9 @@ class IDataType : private boost::noncopyable
}

/** Read no more than limit values and append them into column.
* avg_value_size_hint - if not zero, may be used to avoid reallocations while reading column of String type.
* `avg_value_size_hint` - if not zero, may be used to avoid reallocations while reading column of String type.
* `position_independent_encoding` - provide better performance when it is false, but it requires not to be
* deserialized the data into a column with existing data.
*/
virtual void deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
Expand Down Expand Up @@ -295,89 +299,89 @@ class IDataType : private boost::noncopyable
/** Can appear in table definition.
* Counterexamples: Interval, Nothing.
*/
virtual bool cannotBeStoredInTables() const { return false; };
virtual bool cannotBeStoredInTables() const { return false; }

/** In text formats that render "pretty" tables,
* is it better to align value right in table cell.
* Examples: numbers, even nullable.
*/
virtual bool shouldAlignRightInPrettyFormats() const { return false; };
virtual bool shouldAlignRightInPrettyFormats() const { return false; }

/** Does formatted value in any text format can contain anything but valid UTF8 sequences.
* Example: String (because it can contain arbitary bytes).
* Counterexamples: numbers, Date, DateTime.
* For Enum, it depends.
*/
virtual bool textCanContainOnlyValidUTF8() const { return false; };
virtual bool textCanContainOnlyValidUTF8() const { return false; }

/** Is it possible to compare for less/greater, to calculate min/max?
* Not necessarily totally comparable. For example, floats are comparable despite the fact that NaNs compares to nothing.
* The same for nullable of comparable types: they are comparable (but not totally-comparable).
*/
virtual bool isComparable() const { return false; };
virtual bool isComparable() const { return false; }

/** Does it make sense to use this type with COLLATE modifier in ORDER BY.
* Example: String, but not FixedString.
*/
virtual bool canBeComparedWithCollation() const { return false; };
virtual bool canBeComparedWithCollation() const { return false; }

/** If the type is totally comparable (Ints, Date, DateTime, not nullable, not floats)
* and "simple" enough (not String, FixedString) to be used as version number
* (to select rows with maximum version).
*/
virtual bool canBeUsedAsVersion() const { return false; };
virtual bool canBeUsedAsVersion() const { return false; }

/** Values of data type can be summed (possibly with overflow, within the same data type).
* Example: numbers, even nullable. Not Date/DateTime. Not Enum.
* Enums can be passed to aggregate function 'sum', but the result is Int64, not Enum, so they are not summable.
*/
virtual bool isSummable() const { return false; };
virtual bool isSummable() const { return false; }

/** Can be used in operations like bit and, bit shift, bit not, etc.
*/
virtual bool canBeUsedInBitOperations() const { return false; };
virtual bool canBeUsedInBitOperations() const { return false; }

/** Can be used in boolean context (WHERE, HAVING).
* UInt8, maybe nullable.
*/
virtual bool canBeUsedInBooleanContext() const { return false; };
virtual bool canBeUsedInBooleanContext() const { return false; }

/** Integers, floats, not Nullable. Not Enums. Not Date/DateTime.
*/
virtual bool isNumber() const { return false; };
virtual bool isNumber() const { return false; }

/** Integers. Not Nullable. Not Enums. Not Date/DateTime.
*/
virtual bool isInteger() const { return false; };
virtual bool isUnsignedInteger() const { return false; };
virtual bool isInteger() const { return false; }
virtual bool isUnsignedInteger() const { return false; }

/** Floating point values. Not Nullable. Not Enums. Not Date/DateTime.
*/
virtual bool isFloatingPoint() const { return false; }

/** Date, DateTime, MyDate, MyDateTime. Not Nullable.
*/
virtual bool isDateOrDateTime() const { return false; };
virtual bool isDateOrDateTime() const { return false; }

/** MyDate, MyDateTime. Not Nullable.
*/
virtual bool isMyDateOrMyDateTime() const { return false; };
virtual bool isMyDateOrMyDateTime() const { return false; }

/** MyTime. Not Nullable.
*/
virtual bool isMyTime() const { return false; };
virtual bool isMyTime() const { return false; }

/** Decimal. Not Nullable.
*/
virtual bool isDecimal() const { return false; };
virtual bool isDecimal() const { return false; }

/** Numbers, Enums, Date, DateTime, MyDate, MyDateTime. Not nullable.
*/
virtual bool isValueRepresentedByNumber() const { return false; };
virtual bool isValueRepresentedByNumber() const { return false; }

/** Integers, Enums, Date, DateTime, MyDate, MyDateTime. Not nullable.
*/
virtual bool isValueRepresentedByInteger() const { return false; };
virtual bool isValueRepresentedByInteger() const { return false; }

/** Values are unambiguously identified by contents of contiguous memory region,
* that can be obtained by IColumn::getDataAt method.
Expand All @@ -386,23 +390,23 @@ class IDataType : private boost::noncopyable
* (because Array(String) values became ambiguous if you concatenate Strings).
* Counterexamples: Nullable, Tuple.
*/
virtual bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const { return false; };
virtual bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const { return false; }

virtual bool isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion() const
{
return isValueUnambiguouslyRepresentedInContiguousMemoryRegion()
&& (isValueRepresentedByNumber() || isFixedString());
};
}

virtual bool isString() const { return false; };
virtual bool isFixedString() const { return false; };
virtual bool isStringOrFixedString() const { return isString() || isFixedString(); };
virtual bool isString() const { return false; }
virtual bool isFixedString() const { return false; }
virtual bool isStringOrFixedString() const { return isString() || isFixedString(); }

/** Example: numbers, Date, DateTime, FixedString, Enum... Nullable and Tuple of such types.
* Counterexamples: String, Array.
* It's Ok to return false for AggregateFunction despite the fact that some of them have fixed size state.
*/
virtual bool haveMaximumSizeOfValue() const { return false; };
virtual bool haveMaximumSizeOfValue() const { return false; }

/** Size in amount of bytes in memory. Throws an exception if not haveMaximumSizeOfValue.
*/
Expand All @@ -414,9 +418,9 @@ class IDataType : private boost::noncopyable

/** Integers (not floats), Enum, String, FixedString.
*/
virtual bool isCategorial() const { return false; };
virtual bool isCategorial() const { return false; }

virtual bool isEnum() const { return false; };
virtual bool isEnum() const { return false; }

virtual bool isNullable() const { return false; }
/** Is this type can represent only NULL value? (It also implies isNullable)
Expand All @@ -425,7 +429,7 @@ class IDataType : private boost::noncopyable

/** If this data type cannot be wrapped in Nullable data type.
*/
virtual bool canBeInsideNullable() const { return false; };
virtual bool canBeInsideNullable() const { return false; }

/// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column.
static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint);
Expand Down
16 changes: 14 additions & 2 deletions dbms/src/DataTypes/tests/gtest_data_type_get_common_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@
// limitations under the License.

#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/getMostSubtype.h>
#include <DataTypes/isSupportedDataTypeCast.h>
#include <TestUtils/TiFlashTestBasic.h>

#include <sstream>

namespace DB
{
namespace tests
Expand Down Expand Up @@ -330,6 +329,19 @@ try
// not true for nullable
ASSERT_FALSE(ntype->isDateOrDateTime()) << "type: " + type->getName();
}

{
// array can be wrapped by Nullable
auto type = typeFromString("Array(Float32)");
ASSERT_NE(type, nullptr);
auto ntype = DataTypeNullable(type);
ASSERT_TRUE(ntype.isNullable());
}

{
auto type = typeFromString("Nullable(Array(Float32))");
ASSERT_TRUE(type->isNullable());
}
}
CATCH

Expand Down
16 changes: 14 additions & 2 deletions dbms/src/Flash/Coprocessor/CHBlockChunkCodec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,27 @@ void WriteColumnData(const IDataType & type, const ColumnPtr & column, WriteBuff
IDataType::OutputStreamGetter output_stream_getter = [&](const IDataType::SubstreamPath &) {
return &ostr;
};
type.serializeBinaryBulkWithMultipleStreams(*full_column, output_stream_getter, offset, limit, false, {});
type.serializeBinaryBulkWithMultipleStreams(
*full_column,
output_stream_getter,
offset,
limit,
/*position_independent_encoding=*/true,
{});
}

void CHBlockChunkCodec::readData(const IDataType & type, IColumn & column, ReadBuffer & istr, size_t rows)
{
IDataType::InputStreamGetter input_stream_getter = [&](const IDataType::SubstreamPath &) {
return &istr;
};
type.deserializeBinaryBulkWithMultipleStreams(column, input_stream_getter, rows, 0, false, {});
type.deserializeBinaryBulkWithMultipleStreams(
column,
input_stream_getter,
rows,
0,
/*position_independent_encoding=*/true,
{});
}

size_t ApproxBlockBytes(const Block & block)
Expand Down
2 changes: 1 addition & 1 deletion dbms/src/Flash/Coprocessor/CHBlockChunkCodecV1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ static inline void decodeColumnsByBlock(ReadBuffer & istr, Block & res, size_t r
[&](const IDataType::SubstreamPath &) { return &istr; },
sz,
0,
{},
/*position_independent_encoding=*/true,
{});
}
}
Expand Down
Loading