Skip to content

Commit

Permalink
Merge pull request ClickHouse#48838 from amosbird/issue_48735
Browse files Browse the repository at this point in the history
Fix key condition on duplicate primary keys
  • Loading branch information
robot-clickhouse-ci-1 authored and Enmk committed Jun 15, 2023
1 parent 91115cc commit e112350
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 41 deletions.
33 changes: 14 additions & 19 deletions src/Storages/MergeTree/KeyCondition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -743,9 +743,16 @@ KeyCondition::KeyCondition(
, single_point(single_point_)
, strict(strict_)
{
size_t key_index = 0;
for (const auto & name : key_column_names)
{
if (!key_columns.contains(name))
{
key_columns[name] = key_columns.size();
key_indices.push_back(key_index);
}
++key_index;
}

auto filter_node = buildFilterNode(query, additional_filter_asts);

Expand Down Expand Up @@ -808,9 +815,16 @@ KeyCondition::KeyCondition(
, single_point(single_point_)
, strict(strict_)
{
size_t key_index = 0;
for (const auto & name : key_column_names)
{
if (!key_columns.contains(name))
{
key_columns[name] = key_columns.size();
key_indices.push_back(key_index);
}
++key_index;
}

if (!filter_dag)
{
Expand Down Expand Up @@ -2561,25 +2575,6 @@ bool KeyCondition::alwaysFalse() const
return rpn_stack[0] == 0;
}

size_t KeyCondition::getMaxKeyColumn() const
{
size_t res = 0;
for (const auto & element : rpn)
{
if (element.function == RPNElement::FUNCTION_NOT_IN_RANGE
|| element.function == RPNElement::FUNCTION_IN_RANGE
|| element.function == RPNElement::FUNCTION_IS_NULL
|| element.function == RPNElement::FUNCTION_IS_NOT_NULL
|| element.function == RPNElement::FUNCTION_IN_SET
|| element.function == RPNElement::FUNCTION_NOT_IN_SET)
{
if (element.key_column > res)
res = element.key_column;
}
}
return res;
}

bool KeyCondition::hasMonotonicFunctionsChain() const
{
for (const auto & element : rpn)
Expand Down
8 changes: 5 additions & 3 deletions src/Storages/MergeTree/KeyCondition.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,9 +286,6 @@ class KeyCondition

bool alwaysFalse() const;

/// Get the maximum number of the key element used in the condition.
size_t getMaxKeyColumn() const;

bool hasMonotonicFunctionsChain() const;

/// Impose an additional condition: the value in the column `column` must be in the range `range`.
Expand All @@ -297,6 +294,9 @@ class KeyCondition

String toString() const;

/// Get the key indices of key names used in the condition.
const std::vector<size_t> & getKeyIndices() const { return key_indices; }

/// Condition description for EXPLAIN query.
struct Description
{
Expand Down Expand Up @@ -478,6 +478,8 @@ class KeyCondition
RPN rpn;

ColumnIndices key_columns;
std::vector<size_t> key_indices;

/// Expression which is used for key condition.
const ExpressionActionsPtr key_expr;
/// All intermediate columns are used to calculate key_expr.
Expand Down
26 changes: 15 additions & 11 deletions src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1430,18 +1430,21 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
return res;
}

size_t used_key_size = key_condition.getMaxKeyColumn() + 1;
const String & part_name = part->isProjectionPart() ? fmt::format("{}.{}", part->name, part->getParentPart()->name) : part->name;
const auto & primary_key = metadata_snapshot->getPrimaryKey();
auto index_columns = std::make_shared<ColumnsWithTypeAndName>();
const auto & key_indices = key_condition.getKeyIndices();
DataTypes key_types;
for (size_t i : key_indices)
{
index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]});
key_types.emplace_back(primary_key.data_types[i]);
}

std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
/// If there are no monotonic functions, there is no need to save block reference.
/// Passing explicit field to FieldRef allows to optimize ranges and shows better performance.
const auto & primary_key = metadata_snapshot->getPrimaryKey();
std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
if (key_condition.hasMonotonicFunctionsChain())
{
auto index_columns = std::make_shared<ColumnsWithTypeAndName>();
for (size_t i = 0; i < used_key_size; ++i)
index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]});

create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field)
{
Expand All @@ -1453,16 +1456,17 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
}
else
{
create_field_ref = [&index](size_t row, size_t column, FieldRef & field)
create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field)
{
index[column]->get(row, field);
(*index_columns)[column].column->get(row, field);
// NULL_LAST
if (field.isNull())
field = POSITIVE_INFINITY;
};
}

/// NOTE Creating temporary Field objects to pass to KeyCondition.
size_t used_key_size = key_indices.size();
std::vector<FieldRef> index_left(used_key_size);
std::vector<FieldRef> index_right(used_key_size);

Expand All @@ -1487,10 +1491,10 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
create_field_ref(range.end, i, index_right[i]);
}
}
return key_condition.mayBeTrueInRange(
used_key_size, index_left.data(), index_right.data(), primary_key.data_types);
return key_condition.mayBeTrueInRange(used_key_size, index_left.data(), index_right.data(), key_types);
};

const String & part_name = part->isProjectionPart() ? fmt::format("{}.{}", part->name, part->getParentPart()->name) : part->name;
if (!key_condition.matchesExactContinuousRange())
{
// Do exclusion search, where we drop ranges that do not match
Expand Down
16 changes: 8 additions & 8 deletions tests/queries/0_stateless/02540_duplicate_primary_key.sql
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,16 @@ ORDER BY (coverage, situation_name, NAME_toe, NAME_cockroach);

insert into test select * from generateRandom() limit 10;

with dissonance as (
Select cast(toStartOfInterval(coverage, INTERVAL 1 day) as Date) as flour, count() as regulation
with dissonance as (
Select cast(toStartOfInterval(coverage, INTERVAL 1 day) as Date) as flour, count() as regulation
from test
group by flour having flour >= toDate(now())-100
group by flour having flour >= toDate(now())-100
),
cheetah as (
Select flour, regulation from dissonance
union distinct
Select toDate(now())-1, ifnull((select regulation from dissonance where flour = toDate(now())-1),0) as regulation
)
cheetah as (
Select flour, regulation from dissonance
union distinct
Select toDate(now())-1, ifnull((select regulation from dissonance where flour = toDate(now())-1),0) as regulation
)
Select flour, regulation from cheetah order by flour with fill step 1 limit 100 format Null;

drop table test;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
200
99 changes: 99 additions & 0 deletions tests/queries/0_stateless/02540_duplicate_primary_key2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
drop table if exists test;

set allow_suspicious_low_cardinality_types = 1;

CREATE TABLE test
(
`timestamp` DateTime,
`latitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)),
`longitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)),
`xxxx1` LowCardinality(UInt8),
`xxxx2` LowCardinality(Nullable(Int16)),
`xxxx3` LowCardinality(Nullable(Int16)),
`xxxx4` Nullable(Int32),
`xxxx5` LowCardinality(Nullable(Int32)),
`xxxx6` Nullable(Int32),
`xxxx7` Nullable(Int32),
`xxxx8` LowCardinality(Int32),
`xxxx9` LowCardinality(Nullable(Int16)),
`xxxx10` LowCardinality(Nullable(Int16)),
`xxxx11` LowCardinality(Nullable(Int16)),
`xxxx12` LowCardinality(String),
`xxxx13` Nullable(Float32),
`xxxx14` LowCardinality(String),
`xxxx15` LowCardinality(Nullable(String)),
`xxxx16` LowCardinality(String),
`xxxx17` LowCardinality(String),
`xxxx18` FixedString(19),
`xxxx19` FixedString(17),
`xxxx20` LowCardinality(UInt8),
`xxxx21` LowCardinality(Nullable(Int16)),
`xxxx22` LowCardinality(Nullable(Int16)),
`xxxx23` LowCardinality(Nullable(Int16)),
`xxxx24` LowCardinality(Nullable(Int16)),
`xxxx25` LowCardinality(Nullable(Int16)),
`xxxx26` LowCardinality(Nullable(Int16)),
`xxxx27` Nullable(Float32),
`xxxx28` LowCardinality(Nullable(String)),
`xxxx29` LowCardinality(String),
`xxxx30` LowCardinality(String),
`xxxx31` LowCardinality(Nullable(String)),
`xxxx32` UInt64,
PROJECTION cumsum_projection_simple
(
SELECT
xxxx1,
toStartOfInterval(timestamp, toIntervalMonth(1)),
toStartOfWeek(timestamp, 8),
toStartOfInterval(timestamp, toIntervalDay(1)),
xxxx17,
xxxx16,
xxxx14,
xxxx9,
xxxx10,
xxxx21,
xxxx22,
xxxx11,
sum(multiIf(xxxx21 IS NULL, 0, 1)),
sum(multiIf(xxxx22 IS NULL, 0, 1)),
sum(multiIf(xxxx23 IS NULL, 0, 1)),
max(toStartOfInterval(timestamp, toIntervalDay(1))),
max(CAST(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'), 'Nullable(TIMESTAMP)')),
min(toStartOfInterval(timestamp, toIntervalDay(1))),
min(CAST(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'), 'Nullable(TIMESTAMP)')),
count(),
sum(1),
COUNTDistinct(xxxx16),
COUNTDistinct(xxxx31),
COUNTDistinct(xxxx14),
COUNTDistinct(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'))
GROUP BY
xxxx1,
toStartOfInterval(timestamp, toIntervalMonth(1)),
toStartOfWeek(timestamp, 8),
toStartOfInterval(timestamp, toIntervalDay(1)),
xxxx1,
toStartOfInterval(timestamp, toIntervalMonth(1)),
toStartOfWeek(timestamp, 8),
toStartOfInterval(timestamp, toIntervalDay(1)),
xxxx17,
xxxx16,
xxxx14,
xxxx9,
xxxx10,
xxxx21,
xxxx22,
xxxx11
)
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(timestamp)
ORDER BY (xxxx17, xxxx14, xxxx16, toStartOfDay(timestamp), left(xxxx19, 10), timestamp);

INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'Airtel' as xxxx14 FROM generateRandom() LIMIT 100;
INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'BSNL' as xxxx14 FROM generateRandom() LIMIT 100;
INSERT INTO test SELECT * replace 1 as xxxx16 replace 1 as xxxx1 replace '2022-02-02 01:00:00' as timestamp replace 'xxx' as xxxx14 FROM generateRandom() LIMIT 100;

select sum(1) from test where toStartOfInterval(timestamp, INTERVAL 1 day) >= TIMESTAMP '2022-02-01 01:00:00' and xxxx14 in ('Airtel', 'BSNL') and xxxx1 = 1 GROUP BY xxxx16;

drop table test;

0 comments on commit e112350

Please sign in to comment.