From 390b3fe8db99ba57282dac82f69d609a34089b73 Mon Sep 17 00:00:00 2001 From: kst-morozov Date: Fri, 4 Oct 2024 13:57:32 +0200 Subject: [PATCH 1/6] add test that reproduce problem with order parts for ReplacingMergeTree after backup. --- .../features/backup_restore.feature | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/integration/features/backup_restore.feature b/tests/integration/features/backup_restore.feature index 06627165..a915f126 100644 --- a/tests/integration/features/backup_restore.feature +++ b/tests/integration/features/backup_restore.feature @@ -87,6 +87,78 @@ Feature: Backup & Restore And we restore clickhouse backup #0 to clickhouse02 Then we got same clickhouse data at clickhouse01 clickhouse02 + Scenario: Backup & Restore for ReplacingMergeTree + When we drop all databases at clickhouse01 + And we drop all databases at clickhouse02 + Given we have executed queries on clickhouse01 + """ + CREATE DATABASE test_db; + CREATE TABLE test_db.hits ( + id UInt32, + url String, + visits UInt32 + ) + ENGINE ReplacingMergeTree + ORDER BY id; + INSERT INTO test_db.hits VALUES + (1, '/index', 100); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (1, '/index', 101); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (1, '/index', 102); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (1, '/index', 103); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (1, '/index', 104); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (1, '/index', 105); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (1, '/index', 106); + """ + When we create clickhouse01 clickhouse backup + And we restore clickhouse backup #0 to clickhouse02 + When we execute query on clickhouse01 + """ + SELECT id, visits FROM test_db.hits FINAL ORDER BY id FORMAT Vertical; + """ + Then we get response + """ + Row 1: + ────── + id: 1 + visits: 106 + """ + When we execute query on clickhouse02 + """ + SELECT id, visits FROM test_db.hits FINAL ORDER BY id FORMAT Vertical; + """ + Then we get response + """ + Row 1: + ────── + id: 1 + visits: 106 + """ + Then we got same clickhouse data at clickhouse01 clickhouse02 + Scenario: Backup & Restore with long file names When we drop all databases at clickhouse01 And we drop all databases at clickhouse02 From 2f02c3a5e5ab224d3aef3c555000e491a9995ffd Mon Sep 17 00:00:00 2001 From: kst-morozov Date: Wed, 23 Oct 2024 15:54:08 +0200 Subject: [PATCH 2/6] sort parts was added. --- ch_backup/backup/metadata/table_metadata.py | 21 +++++++- .../features/backup_restore.feature | 48 +++++++++++++++---- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/ch_backup/backup/metadata/table_metadata.py b/ch_backup/backup/metadata/table_metadata.py index 23390fe1..7d3a8b08 100644 --- a/ch_backup/backup/metadata/table_metadata.py +++ b/ch_backup/backup/metadata/table_metadata.py @@ -1,13 +1,15 @@ """ Backup metadata for ClickHouse table. """ - from types import SimpleNamespace from typing import List, Optional, Set from ch_backup.backup.metadata.part_metadata import PartMetadata +REPLACING_MERGE_TREE = "ReplacingMergeTree" + + class TableMetadata(SimpleNamespace): """ Backup metadata for ClickHouse table. @@ -53,6 +55,23 @@ def get_parts(self, *, excluded_parts: Set[str] = None) -> List[PartMetadata]: PartMetadata.load(self.database, self.name, part_name, raw_metadata) ) + if self.engine == REPLACING_MERGE_TREE: + def split_part_name(part): + chunks = part.split('_', maxsplit=3) + partition = "" + suffix = "" + try: + partition = chunks[0] + start_range = int(chunks[1]) + finish_range = int(chunks[2]) + suffix = chunks[3] + except (IndexError, ValueError): + start_range = 0 + finish_range = 0 + return partition, start_range, finish_range, suffix + + result.sort(key=lambda part: split_part_name(part.name)) + return result def add_part(self, part: PartMetadata) -> None: diff --git a/tests/integration/features/backup_restore.feature b/tests/integration/features/backup_restore.feature index a915f126..bc1ebbfa 100644 --- a/tests/integration/features/backup_restore.feature +++ b/tests/integration/features/backup_restore.feature @@ -94,44 +94,72 @@ Feature: Backup & Restore """ CREATE DATABASE test_db; CREATE TABLE test_db.hits ( + dt DateTime, id UInt32, url String, visits UInt32 ) ENGINE ReplacingMergeTree - ORDER BY id; + ORDER BY (dt, id) + PARTITION BY toYYYYMM(dt); + + INSERT INTO test_db.hits VALUES + (now(), 1, '/index', 100); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (now(), 1, '/index', 101); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (now(), 1, '/index', 102); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (now(), 1, '/index', 103); + """ + Given we have executed queries on clickhouse01 + """ + INSERT INTO test_db.hits VALUES + (now(), 1, '/index', 104); + """ + Given we have executed queries on clickhouse01 + """ INSERT INTO test_db.hits VALUES - (1, '/index', 100); + (now(), 1, '/index', 105); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (1, '/index', 101); + (now(), 1, '/index', 106); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (1, '/index', 102); + (now(), 1, '/index', 107); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (1, '/index', 103); + (now(), 1, '/index', 108); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (1, '/index', 104); + (now(), 1, '/index', 109); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (1, '/index', 105); + (now(), 1, '/index', 110); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (1, '/index', 106); + (now(), 1, '/index', 111); """ When we create clickhouse01 clickhouse backup And we restore clickhouse backup #0 to clickhouse02 @@ -144,7 +172,7 @@ Feature: Backup & Restore Row 1: ────── id: 1 - visits: 106 + visits: 111 """ When we execute query on clickhouse02 """ @@ -155,7 +183,7 @@ Feature: Backup & Restore Row 1: ────── id: 1 - visits: 106 + visits: 111 """ Then we got same clickhouse data at clickhouse01 clickhouse02 From ced92d57814342626a58c7c379ef26bf17d4021a Mon Sep 17 00:00:00 2001 From: kst-morozov Date: Wed, 23 Oct 2024 15:57:20 +0200 Subject: [PATCH 3/6] fix style --- ch_backup/backup/metadata/table_metadata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ch_backup/backup/metadata/table_metadata.py b/ch_backup/backup/metadata/table_metadata.py index 7d3a8b08..e8e00462 100644 --- a/ch_backup/backup/metadata/table_metadata.py +++ b/ch_backup/backup/metadata/table_metadata.py @@ -1,12 +1,12 @@ """ Backup metadata for ClickHouse table. """ + from types import SimpleNamespace from typing import List, Optional, Set from ch_backup.backup.metadata.part_metadata import PartMetadata - REPLACING_MERGE_TREE = "ReplacingMergeTree" @@ -56,8 +56,9 @@ def get_parts(self, *, excluded_parts: Set[str] = None) -> List[PartMetadata]: ) if self.engine == REPLACING_MERGE_TREE: + def split_part_name(part): - chunks = part.split('_', maxsplit=3) + chunks = part.split("_", maxsplit=3) partition = "" suffix = "" try: From 45e42900c8733cf68097e58584e4e10035e88305 Mon Sep 17 00:00:00 2001 From: kst-morozov Date: Thu, 24 Oct 2024 12:33:29 +0200 Subject: [PATCH 4/6] fix test --- .../features/backup_restore.feature | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/integration/features/backup_restore.feature b/tests/integration/features/backup_restore.feature index bc1ebbfa..0dae5353 100644 --- a/tests/integration/features/backup_restore.feature +++ b/tests/integration/features/backup_restore.feature @@ -104,62 +104,62 @@ Feature: Backup & Restore PARTITION BY toYYYYMM(dt); INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 100); + (toDate('2024-10-24'), 1, '/index', 100); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 101); + (toDate('2024-10-24'), 1, '/index', 101); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 102); + (toDate('2024-10-24'), 1, '/index', 102); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 103); + (toDate('2024-10-24'), 1, '/index', 103); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 104); + (toDate('2024-10-24'), 1, '/index', 104); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 105); + (toDate('2024-10-24'), 1, '/index', 105); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 106); + (toDate('2024-10-24'), 1, '/index', 106); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 107); + (toDate('2024-10-24'), 1, '/index', 107); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 108); + (toDate('2024-10-24'), 1, '/index', 108); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 109); + (toDate('2024-10-24'), 1, '/index', 109); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 110); + (toDate('2024-10-24'), 1, '/index', 110); """ Given we have executed queries on clickhouse01 """ INSERT INTO test_db.hits VALUES - (now(), 1, '/index', 111); + (toDate('2024-10-24'), 1, '/index', 111); """ When we create clickhouse01 clickhouse backup And we restore clickhouse backup #0 to clickhouse02 From 01d67a648699aeb4021e9ebccf42ec98bf513603 Mon Sep 17 00:00:00 2001 From: kst-morozov Date: Thu, 24 Oct 2024 14:41:57 +0200 Subject: [PATCH 5/6] comments were applied --- ch_backup/backup/metadata/table_metadata.py | 54 +++++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/ch_backup/backup/metadata/table_metadata.py b/ch_backup/backup/metadata/table_metadata.py index e8e00462..edc9cf84 100644 --- a/ch_backup/backup/metadata/table_metadata.py +++ b/ch_backup/backup/metadata/table_metadata.py @@ -3,11 +3,22 @@ """ from types import SimpleNamespace -from typing import List, Optional, Set +from typing import List, NamedTuple, Optional, Set from ch_backup.backup.metadata.part_metadata import PartMetadata -REPLACING_MERGE_TREE = "ReplacingMergeTree" + +class PartInfo(NamedTuple): + """ + Parsed part name. + https://github.com/ClickHouse/ClickHouse/blob/e2821c5e8b728d1d28f9e0b98db87e0af5bc4a29/src/Storages/MergeTree/MergeTreePartInfo.cpp#L54 + """ + + partition_id: str + min_block_num: int + max_block_num: int + level: int + mutation: str class TableMetadata(SimpleNamespace): @@ -43,7 +54,7 @@ def uuid(self) -> Optional[str]: def get_parts(self, *, excluded_parts: Set[str] = None) -> List[PartMetadata]: """ - Return data parts. + Return data parts (sorted). """ if not excluded_parts: excluded_parts = set() @@ -55,24 +66,25 @@ def get_parts(self, *, excluded_parts: Set[str] = None) -> List[PartMetadata]: PartMetadata.load(self.database, self.name, part_name, raw_metadata) ) - if self.engine == REPLACING_MERGE_TREE: - - def split_part_name(part): - chunks = part.split("_", maxsplit=3) - partition = "" - suffix = "" - try: - partition = chunks[0] - start_range = int(chunks[1]) - finish_range = int(chunks[2]) - suffix = chunks[3] - except (IndexError, ValueError): - start_range = 0 - finish_range = 0 - return partition, start_range, finish_range, suffix - - result.sort(key=lambda part: split_part_name(part.name)) - + def split_part_name(part: str) -> PartInfo: + max_split = 4 + chunks = part.split("_", maxsplit=max_split) + partition_id = "" + level = 0 + mutation = "" + try: + partition_id = chunks[0] + min_block_num = int(chunks[1]) + max_block_num = int(chunks[2]) + level = int(chunks[3]) + if max_split + 1 == len(chunks): + mutation = chunks[4] + except (IndexError, ValueError): + min_block_num = 0 + max_block_num = 0 + return PartInfo(partition_id, min_block_num, max_block_num, level, mutation) + + result.sort(key=lambda part: split_part_name(part.name)) return result def add_part(self, part: PartMetadata) -> None: From f57383888686a67723a39e8c205743aea5e85858 Mon Sep 17 00:00:00 2001 From: Konstantin Morozov Date: Mon, 28 Oct 2024 10:12:21 +0100 Subject: [PATCH 6/6] mutation type --- ch_backup/backup/metadata/table_metadata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ch_backup/backup/metadata/table_metadata.py b/ch_backup/backup/metadata/table_metadata.py index edc9cf84..7ea45373 100644 --- a/ch_backup/backup/metadata/table_metadata.py +++ b/ch_backup/backup/metadata/table_metadata.py @@ -18,7 +18,7 @@ class PartInfo(NamedTuple): min_block_num: int max_block_num: int level: int - mutation: str + mutation: int class TableMetadata(SimpleNamespace): @@ -71,14 +71,14 @@ def split_part_name(part: str) -> PartInfo: chunks = part.split("_", maxsplit=max_split) partition_id = "" level = 0 - mutation = "" + mutation = 0 try: partition_id = chunks[0] min_block_num = int(chunks[1]) max_block_num = int(chunks[2]) level = int(chunks[3]) if max_split + 1 == len(chunks): - mutation = chunks[4] + mutation = int(chunks[4]) except (IndexError, ValueError): min_block_num = 0 max_block_num = 0