From cf766e27e06cb020b2e74cbeb1e91100785d5b97 Mon Sep 17 00:00:00 2001
From: Konstantin Morozov <34001730+k-morozov@users.noreply.github.com>
Date: Mon, 28 Oct 2024 17:27:40 +0100
Subject: [PATCH] Bugfix: order parts after backup for ReplacingMergeTree (#191)

* add test that reproduce problem with order parts for ReplacingMergeTree after backup.

* sort parts was added.

* fix style

* fix test

* comments were applied

* mutation type

---------

Co-authored-by: kst-morozov
---
NOTE(review): this patch was rebuilt from a copy whose whitespace had been
collapsed onto a few long lines. Line breaks and blank lines follow the hunk
line counts; indentation, and the column padding in the expected
"FORMAT Vertical" output, are inferred. Verify against the upstream commit
before applying.

 ch_backup/backup/metadata/table_metadata.py   |  36 ++++++-
 .../features/backup_restore.feature           | 100 ++++++++++++++++++
 2 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/ch_backup/backup/metadata/table_metadata.py b/ch_backup/backup/metadata/table_metadata.py
index 23390fe1..7ea45373 100644
--- a/ch_backup/backup/metadata/table_metadata.py
+++ b/ch_backup/backup/metadata/table_metadata.py
@@ -3,11 +3,24 @@
 """
 
 from types import SimpleNamespace
-from typing import List, Optional, Set
+from typing import List, NamedTuple, Optional, Set
 
 from ch_backup.backup.metadata.part_metadata import PartMetadata
 
 
+class PartInfo(NamedTuple):
+    """
+    Parsed part name.
+    https://github.com/ClickHouse/ClickHouse/blob/e2821c5e8b728d1d28f9e0b98db87e0af5bc4a29/src/Storages/MergeTree/MergeTreePartInfo.cpp#L54
+    """
+
+    partition_id: str
+    min_block_num: int
+    max_block_num: int
+    level: int
+    mutation: int
+
+
 class TableMetadata(SimpleNamespace):
     """
     Backup metadata for ClickHouse table.
@@ -41,7 +54,7 @@ def uuid(self) -> Optional[str]:
 
     def get_parts(self, *, excluded_parts: Set[str] = None) -> List[PartMetadata]:
         """
-        Return data parts.
+        Return data parts (sorted).
         """
         if not excluded_parts:
             excluded_parts = set()
@@ -53,6 +66,25 @@ def get_parts(self, *, excluded_parts: Set[str] = None) -> List[PartMetadata]:
                     PartMetadata.load(self.database, self.name, part_name, raw_metadata)
                 )
 
+        def split_part_name(part: str) -> PartInfo:
+            max_split = 4
+            chunks = part.split("_", maxsplit=max_split)
+            partition_id = ""
+            level = 0
+            mutation = 0
+            try:
+                partition_id = chunks[0]
+                min_block_num = int(chunks[1])
+                max_block_num = int(chunks[2])
+                level = int(chunks[3])
+                if max_split + 1 == len(chunks):
+                    mutation = int(chunks[4])
+            except (IndexError, ValueError):
+                min_block_num = 0
+                max_block_num = 0
+            return PartInfo(partition_id, min_block_num, max_block_num, level, mutation)
+
+        result.sort(key=lambda part: split_part_name(part.name))
         return result
 
     def add_part(self, part: PartMetadata) -> None:
diff --git a/tests/integration/features/backup_restore.feature b/tests/integration/features/backup_restore.feature
index 06627165..0dae5353 100644
--- a/tests/integration/features/backup_restore.feature
+++ b/tests/integration/features/backup_restore.feature
@@ -87,6 +87,106 @@ Feature: Backup & Restore
     And we restore clickhouse backup #0 to clickhouse02
     Then we got same clickhouse data at clickhouse01 clickhouse02
 
+  Scenario: Backup & Restore for ReplacingMergeTree
+    When we drop all databases at clickhouse01
+    And we drop all databases at clickhouse02
+    Given we have executed queries on clickhouse01
+    """
+    CREATE DATABASE test_db;
+    CREATE TABLE test_db.hits (
+        dt DateTime,
+        id UInt32,
+        url String,
+        visits UInt32
+    )
+    ENGINE ReplacingMergeTree
+    ORDER BY (dt, id)
+    PARTITION BY toYYYYMM(dt);
+
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 100);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 101);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 102);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 103);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 104);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 105);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 106);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 107);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 108);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 109);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 110);
+    """
+    Given we have executed queries on clickhouse01
+    """
+    INSERT INTO test_db.hits VALUES
+        (toDate('2024-10-24'), 1, '/index', 111);
+    """
+    When we create clickhouse01 clickhouse backup
+    And we restore clickhouse backup #0 to clickhouse02
+    When we execute query on clickhouse01
+    """
+    SELECT id, visits FROM test_db.hits FINAL ORDER BY id FORMAT Vertical;
+    """
+    Then we get response
+    """
+    Row 1:
+    ──────
+    id:     1
+    visits: 111
+    """
+    When we execute query on clickhouse02
+    """
+    SELECT id, visits FROM test_db.hits FINAL ORDER BY id FORMAT Vertical;
+    """
+    Then we get response
+    """
+    Row 1:
+    ──────
+    id:     1
+    visits: 111
+    """
+    Then we got same clickhouse data at clickhouse01 clickhouse02
+
   Scenario: Backup & Restore with long file names
     When we drop all databases at clickhouse01
     And we drop all databases at clickhouse02