From 2ff10b93e38d0eca0d62a00bbf5a3832387e954e Mon Sep 17 00:00:00 2001 From: JaySon Date: Fri, 2 Aug 2019 14:37:28 +0800 Subject: [PATCH] [FLASH-318/319/343/344] PageStorage: RefPages, MVCC && Snapshot read (#97) * Page storage bug fix (#87) * add test cases for PageStorage * split PageStorage gc stage into small helper functions * add test cases for PageStorage gc concurrency * add stress test and dump utils of PageStorage * Fix bug: 1. ensure PageFile with meta && data, in case gc drop file but be killed before drop dir 2. turn PageStorage::Config::sync_on_write = true by default 3. avoid PageStorage::gc run by multi-threads 4. print PageFile's path if checksum is not correct * throw exception for must_exist==false and errno != ENOENT * add RefPage && ref counting * 1. move PageCacheMap into isolated file 2. rename PageCacheMap -> PageEntryMap * accept non-exist Page reference/del while being used by WriteBatch or Gc * adjust gc on RefPages * add more test on gc RefPages * add memory usage comparing between vec/hashmap/treemap * fix tests code format * Bug fix: Add RefPage to non exist Page * Add unit test for PageStorage * fix bug * stress test for benchmark * fix bug of PageEntryMap updating RefPage * Keep RefPage entry ref-count if it has been gc and move to another place * MVCC on PageMap, All previous tests passed. TODO: refactor on read_mutex on PageStorage * PageStorage gc: only delete files that are not used by any version * fix bug under multi-threads; remove unused code && format codes * 1. support snapshot read 2. standalone VersionSet/MultiVersionCountable template * 1. Apply merge delta if no read ref 2. Apply generate new delta if has read ref 3. Delta merged when snapshot released 4. Add simple restore test 5. Use template and constexpr instead of copying codes in PageEntryMap * add test for PageStorage snapshot read * use typed test * Bug fix: Add RefPage to non exist Page * fix bug * Delta merged when view is finished * 1. 
VersionDeltaSet: add snapshots for snapshot linked-list, so that gc can collect valid PageFiles 2. fix some bugs * do compact on delta -> base * rebase after do compact on delta * fix bugs * avoid duplicated codes * 1. new find API 2. all tests passed * 1. remove legacy API 2. remove duplicated codes * reformat * apply inplace if there are no readers * fix bug of applying edits * fix bug of gc apply * fix bug of merging PageEntryMap * fix bug of invalid ref page * fix bug: 1. listAllLiveFiles 2. double compaction on same version * tmp * fixbug * split PageEntryMapView to single cpp file * minor fix * 1. Avoid visit same version multiple times in PageEntryMapDeltaVersionSet::listAllLiveFiles 2. use template to reduce duplicated code for gcApply * todo mark * fix bug: RefPage -> RefPage record may become invalid record when PageStorage GC * remove unused function * simple fix on unittest * fix bug: PageEntryMapView::isRefId * fix bug: PageEntryMapDeltaBuilder::applyPut * fix * fix bug: PageEntryView::validNormalPageIds filter out tombstone of PageEntry * iterator over PageEntryMap * remove unused tests * Refactor on PageEntryMapDeltaVersionSet * Reduce lock range when PageStorage Snapshot is released * Add metrics for PageStorage MVCC * rename some classes * reformat. 
* use exception instead * address comment * address comment * fix compiler error under gcc * fix compiler error under gcc --- dbms/CMakeLists.txt | 1 + dbms/src/Common/ProfileEvents.cpp | 7 + dbms/src/IO/WriteHelpers.h | 8 + dbms/src/Storages/Page/Page.h | 19 +- dbms/src/Storages/Page/PageDefines.h | 1 + dbms/src/Storages/Page/PageEntries.h | 453 ++++++++++++ dbms/src/Storages/Page/PageFile.cpp | 112 +-- dbms/src/Storages/Page/PageFile.h | 57 +- dbms/src/Storages/Page/PageStorage.cpp | 449 +++++++---- dbms/src/Storages/Page/PageStorage.h | 61 +- .../Page/VersionSet/PageEntriesBuilder.cpp | 39 + .../Page/VersionSet/PageEntriesBuilder.h | 70 ++ .../Page/VersionSet/PageEntriesEdit.h | 78 ++ .../Page/VersionSet/PageEntriesVersionSet.cpp | 37 + .../Page/VersionSet/PageEntriesVersionSet.h | 36 + .../PageEntriesVersionSetWithDelta.cpp | 294 ++++++++ .../PageEntriesVersionSetWithDelta.h | 89 +++ .../Page/VersionSet/PageEntriesView.cpp | 154 ++++ .../Page/VersionSet/PageEntriesView.h | 47 ++ dbms/src/Storages/Page/WriteBatch.h | 38 +- dbms/src/Storages/Page/mvcc/VersionSet.h | 219 ++++++ .../Storages/Page/mvcc/VersionSetWithDelta.h | 332 +++++++++ dbms/src/Storages/Page/tests/CMakeLists.txt | 8 +- .../Page/tests/gtest_page_entry_map.cpp | 359 +++++++++ .../Storages/Page/tests/gtest_page_file.cpp | 7 +- .../Page/tests/gtest_page_map_version_set.cpp | 643 ++++++++++++++++ .../Page/tests/gtest_page_storage.cpp | 695 ++++++++++++++++-- .../Storages/Page/tests/mem_usage_test.cpp | 81 ++ .../Page/tests/stress_page_stroage.cpp | 245 +++++- .../test_page_storage_write_disk_full.cpp | 22 +- .../Page/tests/utils_get_valid_pages.cpp | 63 +- .../Storages/Transaction/RegionPersister.cpp | 6 +- 32 files changed, 4327 insertions(+), 403 deletions(-) create mode 100644 dbms/src/Storages/Page/PageEntries.h create mode 100644 dbms/src/Storages/Page/VersionSet/PageEntriesBuilder.cpp create mode 100644 dbms/src/Storages/Page/VersionSet/PageEntriesBuilder.h create mode 100644 
dbms/src/Storages/Page/VersionSet/PageEntriesEdit.h create mode 100644 dbms/src/Storages/Page/VersionSet/PageEntriesVersionSet.cpp create mode 100644 dbms/src/Storages/Page/VersionSet/PageEntriesVersionSet.h create mode 100644 dbms/src/Storages/Page/VersionSet/PageEntriesVersionSetWithDelta.cpp create mode 100644 dbms/src/Storages/Page/VersionSet/PageEntriesVersionSetWithDelta.h create mode 100644 dbms/src/Storages/Page/VersionSet/PageEntriesView.cpp create mode 100644 dbms/src/Storages/Page/VersionSet/PageEntriesView.h create mode 100644 dbms/src/Storages/Page/mvcc/VersionSet.h create mode 100644 dbms/src/Storages/Page/mvcc/VersionSetWithDelta.h create mode 100644 dbms/src/Storages/Page/tests/gtest_page_entry_map.cpp create mode 100644 dbms/src/Storages/Page/tests/gtest_page_map_version_set.cpp create mode 100644 dbms/src/Storages/Page/tests/mem_usage_test.cpp diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index e04717d107c..2d51fb7fffe 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -59,6 +59,7 @@ add_headers_and_sources(dbms src/Storages/Distributed) add_headers_and_sources(dbms src/Storages/MergeTree) add_headers_and_sources(dbms src/Storages/Transaction) add_headers_and_sources(dbms src/Storages/Page) +add_headers_and_sources(dbms src/Storages/Page/VersionSet) add_headers_and_sources(dbms src/Raft) add_headers_and_sources(dbms src/TiDB) add_headers_and_sources(dbms src/Client) diff --git a/dbms/src/Common/ProfileEvents.cpp b/dbms/src/Common/ProfileEvents.cpp index 1e23a00b011..6bc34ce02b3 100644 --- a/dbms/src/Common/ProfileEvents.cpp +++ b/dbms/src/Common/ProfileEvents.cpp @@ -162,6 +162,13 @@ M(PSMWriteFailed) \ M(PSMReadFailed) \ \ + M(PSMVCCApplyOnCurrentBase) \ + M(PSMVCCApplyOnCurrentDelta) \ + M(PSMVCCApplyOnNewDelta) \ + M(PSMVCCCompactOnDelta) \ + M(PSMVCCCompactOnDeltaRebaseRejected) \ + M(PSMVCCCompactOnBase) \ + \ M(DMWriteBlock) \ M(DMWriteBlockNS) \ M(DMAppendDelta) \ diff --git a/dbms/src/IO/WriteHelpers.h 
b/dbms/src/IO/WriteHelpers.h index 583dc40cf2e..f005087f976 100644 --- a/dbms/src/IO/WriteHelpers.h +++ b/dbms/src/IO/WriteHelpers.h @@ -831,4 +831,12 @@ toString(const T & x, int precision) return ss.str(); } +/// Pointer to a string +inline String ptrToString(const void * const p) +{ + std::stringstream ss; + ss << p; + return ss.str(); +} + } diff --git a/dbms/src/Storages/Page/Page.h b/dbms/src/Storages/Page/Page.h index 64c46988318..5cc6d4d363a 100644 --- a/dbms/src/Storages/Page/Page.h +++ b/dbms/src/Storages/Page/Page.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -27,8 +28,8 @@ using Pages = std::vector; using PageMap = std::map; using PageHandler = std::function; -// Indicate the page size && offset in PageFile. TODO: rename to `PageEntry`? -struct PageCache +// Indicate the page size && offset in PageFile. +struct PageEntry { // if file_id == 0, means it is invalid PageFileId file_id = 0; @@ -37,15 +38,15 @@ struct PageCache UInt64 offset = 0; UInt64 tag = 0; UInt64 checksum = 0; + UInt32 ref = 1; // for ref counting - bool isValid() const { return file_id != 0; } - PageFileIdAndLevel fileIdLevel() const { return std::make_pair(file_id, level); } + inline bool isValid() const { return file_id != 0; } + inline bool isTombstone() const { return ref == 0; } + inline PageFileIdAndLevel fileIdLevel() const { return std::make_pair(file_id, level); } }; -static_assert(std::is_trivially_copyable_v); +static_assert(std::is_trivially_copyable_v); -using PageCacheMap = std::unordered_map; -using PageCaches = std::vector; -using PageIdAndCache = std::pair; -using PageIdAndCaches = std::vector; +using PageIdAndEntry = std::pair; +using PageIdAndEntries = std::vector; } // namespace DB diff --git a/dbms/src/Storages/Page/PageDefines.h b/dbms/src/Storages/Page/PageDefines.h index f746415f765..f9d8c0065e5 100644 --- a/dbms/src/Storages/Page/PageDefines.h +++ b/dbms/src/Storages/Page/PageDefines.h @@ -3,6 +3,7 @@ #include #include +#include #include 
namespace DB diff --git a/dbms/src/Storages/Page/PageEntries.h b/dbms/src/Storages/Page/PageEntries.h new file mode 100644 index 00000000000..0375edd270d --- /dev/null +++ b/dbms/src/Storages/Page/PageEntries.h @@ -0,0 +1,453 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} // namespace ErrorCodes + +template +class PageEntriesMixin +{ +public: + explicit PageEntriesMixin(bool is_base_) : normal_pages(), page_ref(), ref_deletions(), max_page_id(0), is_base(is_base_) {} + +public: + static std::shared_ptr createBase() { return std::make_shared(true); } + + static std::shared_ptr createDelta() { return std::make_shared(false); } + + bool isBase() const { return is_base; } + +public: + /** Update Page{page_id} / RefPage{page_id} entry. If it's a new page_id, + * create a RefPage{page_id} -> Page{page_id} at the same time. + * If page_id is a ref-id of RefPage, it will find corresponding Page + * and update that Page, all other RefPages reference to that Page get updated. + */ + void put(PageId page_id, const PageEntry & entry); + + /** Delete RefPage{page_id} and decrease corresponding Page ref-count. + * if origin Page ref-count down to 0, the Page is erased from entry map + * template must_exist = true ensure that corresponding Page must exist. + * must_exist = false just ignore if that corresponding Page is not exist. + */ + template + void del(PageId page_id); + + /** Bind RefPage{ref_id} to Page{page_id}. + * If page_id is a ref-id of RefPage, it will find corresponding Page + * and bind ref_id to that Page. + * template must_exist = true ensure that corresponding Page must exist. 
+ * must_exist = false if corresponding Page not exist, just add a record for RefPage{ref_id} -> Page{page_id} + */ + template + void ref(PageId ref_id, PageId page_id); + + inline const PageEntry * find(const PageId page_id) const + { + auto ref_iter = page_ref.find(page_id); + if (ref_iter == page_ref.end()) + return nullptr; + else + { + auto normal_iter = normal_pages.find(ref_iter->second); + if (normal_iter == normal_pages.end()) + return nullptr; + else + return &normal_iter->second; + } + } + + inline PageEntry & at(const PageId page_id) + { + PageId normal_page_id = resolveRefId(page_id); + auto iter = normal_pages.find(normal_page_id); + if (likely(iter != normal_pages.end())) + { + return iter->second; + } + else + { + throw DB::Exception("Accessing RefPage" + DB::toString(page_id) + " to non-exist Page" + DB::toString(normal_page_id), + ErrorCodes::LOGICAL_ERROR); + } + } + inline const PageEntry & at(const PageId page_id) const { return const_cast(this)->at(page_id); } + + inline std::pair isRefId(PageId page_id) const + { + auto ref_pair = page_ref.find(page_id); + if (ref_pair == page_ref.end()) + { + return {false, 0UL}; + } + return {ref_pair->second != page_id, ref_pair->second}; + } + + inline void clear() + { + page_ref.clear(); + normal_pages.clear(); + max_page_id = 0; + ref_deletions.clear(); + } + + PageId maxId() const { return max_page_id; } + +public: + using const_normal_page_iterator = std::unordered_map::const_iterator; + // only scan over normal Pages, excluding RefPages + inline const_normal_page_iterator pages_cbegin() const { return normal_pages.cbegin(); } + inline const_normal_page_iterator pages_cend() const { return normal_pages.cend(); } + +protected: + std::unordered_map normal_pages; + std::unordered_map page_ref; // RefPageId -> PageId + // RefPageId deletions + std::unordered_set ref_deletions; + + PageId max_page_id; + bool is_base; + +protected: + size_t numDeletions() const + { + assert(!isBase()); // should only call 
by delta + return ref_deletions.size(); + } + + size_t numRefEntries() const { return page_ref.size(); } + + size_t numNormalEntries() const { return normal_pages.size(); } + + inline bool isRefDeleted(PageId page_id) const { return ref_deletions.count(page_id) > 0; } + +private: + PageId resolveRefId(PageId page_id) const + { + // resolve RefPageId to normal PageId + // if RefPage3 -> Page1, RefPage4 -> RefPage3 + // resolveRefId(3) -> 1 + // resolveRefId(4) -> 1 + auto [is_ref, normal_page_id] = isRefId(page_id); + return is_ref ? normal_page_id : page_id; + } + + template + void decreasePageRef(PageId page_id); + + void copyEntries(const PageEntriesMixin & rhs) + { + page_ref = rhs.page_ref; + normal_pages = rhs.normal_pages; + max_page_id = rhs.max_page_id; + ref_deletions = rhs.ref_deletions; + } + +public: + // no copying allowed + PageEntriesMixin(const PageEntriesMixin &) = delete; + PageEntriesMixin & operator=(const PageEntriesMixin &) = delete; + // only move allowed + PageEntriesMixin(PageEntriesMixin && rhs) noexcept : PageEntriesMixin(true) { *this = std::move(rhs); } + PageEntriesMixin & operator=(PageEntriesMixin && rhs) noexcept + { + if (this != &rhs) + { + normal_pages.swap(rhs.normal_pages); + page_ref.swap(rhs.page_ref); + max_page_id = rhs.max_page_id; + is_base = rhs.is_base; + ref_deletions.swap(rhs.ref_deletions); + } + return *this; + } + + friend class PageEntriesBuilder; + friend class DeltaVersionEditAcceptor; + friend class PageEntriesView; + friend class PageEntriesVersionSetWithDelta; // For copyEntries +}; + +template +void PageEntriesMixin::put(PageId page_id, const PageEntry & entry) +{ + assert(is_base); // can only call by base + const PageId normal_page_id = resolveRefId(page_id); + + // update ref-pairs + bool is_new_ref_pair_inserted = false; + { + // add a RefPage to Page + auto res = page_ref.emplace(page_id, normal_page_id); + is_new_ref_pair_inserted = res.second; + } + + // update normal page's entry + auto ori_iter = 
normal_pages.find(normal_page_id); + if (ori_iter == normal_pages.end()) + { + // Page{normal_page_id} not exist + normal_pages[normal_page_id] = entry; + normal_pages[normal_page_id].ref = 1; + } + else + { + // replace ori Page{normal_page_id}'s entry but inherit ref-counting + const UInt32 page_ref_count = ori_iter->second.ref; + normal_pages[normal_page_id] = entry; + normal_pages[normal_page_id].ref = page_ref_count + is_new_ref_pair_inserted; + } + + // update max_page_id + max_page_id = std::max(max_page_id, page_id); +} + +template +template +void PageEntriesMixin::del(PageId page_id) +{ + assert(is_base); // can only call by base + // Note: must resolve ref-id before erasing entry in `page_ref` + const PageId normal_page_id = resolveRefId(page_id); + page_ref.erase(page_id); + + // decrease origin page's ref counting + decreasePageRef(normal_page_id); +} + +template +template +void PageEntriesMixin::ref(const PageId ref_id, const PageId page_id) +{ + assert(is_base); // can only call by base + // if `page_id` is a ref-id, collapse the ref-path to actual PageId + // eg. exist RefPage2 -> Page1, add RefPage3 -> RefPage2, collapse to RefPage3 -> Page1 + const PageId normal_page_id = resolveRefId(page_id); + auto iter = normal_pages.find(normal_page_id); + if (likely(iter != normal_pages.end())) + { + // if RefPage{ref_id} already exist, release that ref first + const auto ori_ref = page_ref.find(ref_id); + if (unlikely(ori_ref != page_ref.end())) + { + // if RefPage{ref-id} -> Page{normal_page_id} already exists, just ignore + if (ori_ref->second == normal_page_id) + return; + decreasePageRef(ori_ref->second); + } + // build ref + page_ref[ref_id] = normal_page_id; + iter->second.ref += 1; + } + else + { + // The Page to be ref is not exist. 
+ if constexpr (must_exist) + { + throw Exception("Adding RefPage" + DB::toString(ref_id) + " to non-exist Page" + DB::toString(page_id), + ErrorCodes::LOGICAL_ERROR); + } + else + { + // else accept dangling ref if we are writing to a tmp entry map. + // like entry map of WriteBatch or Gc or AnalyzeMeta + page_ref[ref_id] = normal_page_id; + } + } + max_page_id = std::max(max_page_id, std::max(ref_id, page_id)); +} + +template +template +void PageEntriesMixin::decreasePageRef(const PageId page_id) +{ + auto iter = normal_pages.find(page_id); + if constexpr (must_exist) + { + if (unlikely(iter == normal_pages.end())) + { + throw Exception("Decreasing NON-exist normal page[" + DB::toString(page_id) + "] ref-count", ErrorCodes::LOGICAL_ERROR); + } + } + if (iter != normal_pages.end()) + { + auto & entry = iter->second; + entry.ref -= 1; + if (entry.ref == 0) + { + normal_pages.erase(iter); + } + } +} + +/// For PageEntriesVersionSet +class PageEntries : public PageEntriesMixin, public ::DB::MVCC::MultiVersionCountable +{ +public: + explicit PageEntries(bool is_base_ = true) : PageEntriesMixin(true), ::DB::MVCC::MultiVersionCountable(this) + { + (void)is_base_; + } + +public: + /// Iterator definition. 
Used for scan over all RefPages / NormalPages + + class iterator + { + public: + iterator(const std::unordered_map::iterator & iter, std::unordered_map & normal_pages) + : _iter(iter), _normal_pages(normal_pages) + { + } + bool operator==(const iterator & rhs) const { return _iter == rhs._iter; } + bool operator!=(const iterator & rhs) const { return _iter != rhs._iter; } + // prefix incr + inline iterator & operator++() + { + _iter++; + return *this; + } + // suffix incr + inline const iterator operator++(int) + { + iterator tmp(*this); + _iter++; + return tmp; + } + inline PageId pageId() const { return _iter->first; } + inline PageEntry & pageEntry() + { + auto iter = _normal_pages.find(_iter->second); + if (likely(iter != _normal_pages.end())) + { + return iter->second; + } + else + { + throw DB::Exception("Accessing RefPage" + DB::toString(_iter->first) + " to non-exist Page" + DB::toString(_iter->second), + ErrorCodes::LOGICAL_ERROR); + } + } + + private: + std::unordered_map::iterator _iter; + std::unordered_map & _normal_pages; + friend class PageEntriesView; + }; + + class const_iterator + { + public: + const_iterator(const std::unordered_map::const_iterator & iter, + const std::unordered_map & normal_pages) + : _iter(iter), _normal_pages(const_cast &>(normal_pages)) + { + } + bool operator==(const const_iterator & rhs) const { return _iter == rhs._iter; } + bool operator!=(const const_iterator & rhs) const { return _iter != rhs._iter; } + // prefix incr + inline const_iterator & operator++() + { + _iter++; + return *this; + } + // suffix incr + inline const const_iterator operator++(int) + { + const_iterator tmp(*this); + _iter++; + return tmp; + } + inline PageId pageId() const { return _iter->first; } + inline const PageEntry & pageEntry() const + { + auto iter = _normal_pages.find(_iter->second); + if (likely(iter != _normal_pages.end())) + { + return iter->second; + } + else + { + throw DB::Exception("Accessing RefPage" + DB::toString(_iter->first) + 
" to non-exist Page" + DB::toString(_iter->second), + ErrorCodes::LOGICAL_ERROR); + } + } + + private: + std::unordered_map::const_iterator _iter; + std::unordered_map & _normal_pages; + friend class PageEntriesView; + }; + +public: + // Iterator to scan over all ref/normal pages (read only) + inline const_iterator cend() const { return const_iterator(page_ref.cend(), normal_pages); } + inline const_iterator cbegin() const { return const_iterator(page_ref.cbegin(), normal_pages); } +}; + +/// For PageEntriesVersionSetWithDelta +class PageEntriesForDelta : public PageEntriesMixin, + public ::DB::MVCC::MultiVersionCountableForDelta +{ +public: + explicit PageEntriesForDelta(bool is_base_) + : PageEntriesMixin(is_base_), ::DB::MVCC::MultiVersionCountableForDelta() + { + } + + void merge(PageEntriesForDelta & rhs) + { + assert(!rhs.isBase()); // rhs must be delta + for (auto page_id : rhs.ref_deletions) + { + page_ref.erase(page_id); + if (!is_base) + { + ref_deletions.insert(page_id); + } + } + for (auto it : rhs.page_ref) + { + page_ref[it.first] = it.second; + } + for (auto it : rhs.normal_pages) + { + if (it.second.isTombstone() && is_base) + { + // A tombstone of normal page, delete this page + normal_pages.erase(it.first); + } + else + { + normal_pages[it.first] = it.second; + } + } + max_page_id = std::max(max_page_id, rhs.max_page_id); + } + + bool shouldCompactToBase(const ::DB::MVCC::VersionSetConfig & config) + { + assert(!this->isBase()); + return numDeletions() >= config.compact_hint_delta_deletions // + || numRefEntries() >= config.compact_hint_delta_entries || numNormalEntries() >= config.compact_hint_delta_entries; + } +}; + +} // namespace DB diff --git a/dbms/src/Storages/Page/PageFile.cpp b/dbms/src/Storages/Page/PageFile.cpp index eacd6fe8dd9..16c204250a7 100644 --- a/dbms/src/Storages/Page/PageFile.cpp +++ b/dbms/src/Storages/Page/PageFile.cpp @@ -14,7 +14,9 @@ #include #endif +#include #include +#include #include #include @@ -234,7 +236,7 @@ 
namespace PageMetaFormat using WBSize = UInt32; using PageFileVersion = PageFile::Version; using PageTag = UInt64; -using IsPut = UInt8; +using IsPut = std::underlying_type::type; using PageOffset = UInt64; using PageSize = UInt32; using Checksum = UInt64; @@ -245,7 +247,7 @@ static const size_t PAGE_META_SIZE = sizeof(PageId) + sizeof(PageTag) + sizeof(P std::pair genWriteData( // const WriteBatch & wb, PageFile & page_file, - PageCacheMap & page_cache_map) + PageEntriesEdit & edit) { WBSize meta_write_bytes = 0; size_t data_write_bytes = 0; @@ -255,14 +257,20 @@ std::pair genWriteData( // for (const auto & write : wb.getWrites()) { meta_write_bytes += sizeof(IsPut); - if (write.is_put) + switch (write.type) { + case WriteBatch::WriteType::PUT: data_write_bytes += write.size; meta_write_bytes += PAGE_META_SIZE; - } - else - { - meta_write_bytes += sizeof(PageId); // For delete page, store page id only. And don't need to write data file. + break; + case WriteBatch::WriteType::DEL: + // For delete page, store page id only. And don't need to write data file. + meta_write_bytes += sizeof(PageId); + break; + case WriteBatch::WriteType::REF: + // For ref page, store RefPageId -> PageId. And don't need to write data file. + meta_write_bytes += (sizeof(PageId) + sizeof(PageId)); + break; } } @@ -280,21 +288,23 @@ std::pair genWriteData( // PageOffset page_data_file_off = page_file.getDataFileAppendPos(); for (const auto & write : wb.getWrites()) { - put(meta_pos, (IsPut)(write.is_put ? 
1 : 0)); - if (write.is_put) + put(meta_pos, static_cast(write.type)); + switch (write.type) + { + case WriteBatch::WriteType::PUT: { write.read_buffer->readStrict(data_pos, write.size); Checksum page_checksum = CityHash_v1_0_2::CityHash64(data_pos, write.size); data_pos += write.size; - PageCache pc{}; + PageEntry pc{}; pc.file_id = page_file.getFileId(); pc.level = page_file.getLevel(); pc.size = write.size; pc.offset = page_data_file_off; pc.checksum = page_checksum; - page_cache_map[write.page_id] = pc; + edit.put(write.page_id, pc); put(meta_pos, (PageId)write.page_id); put(meta_pos, (PageTag)write.tag); @@ -303,16 +313,23 @@ std::pair genWriteData( // put(meta_pos, (Checksum)page_checksum); page_data_file_off += write.size; + break; } - else - { + case WriteBatch::WriteType::DEL: put(meta_pos, (PageId)write.page_id); - page_cache_map.erase(write.page_id); + edit.del(write.page_id); + break; + case WriteBatch::WriteType::REF: + put(meta_pos, static_cast(write.page_id)); + put(meta_pos, static_cast(write.ori_page_id)); + + edit.ref(write.page_id, write.ori_page_id); + break; } } - Checksum wb_checksum = CityHash_v1_0_2::CityHash64(meta_buffer, meta_write_bytes - sizeof(Checksum)); + const Checksum wb_checksum = CityHash_v1_0_2::CityHash64(meta_buffer, meta_write_bytes - sizeof(Checksum)); put(meta_pos, wb_checksum); if (unlikely(meta_pos != meta_buffer + meta_write_bytes || data_pos != data_buffer + data_write_bytes)) @@ -323,13 +340,13 @@ std::pair genWriteData( // /// Analyze meta file, and return . 
std::pair analyzeMetaFile( // - const String & path, - PageFileId file_id, - UInt32 level, - const char * meta_data, - const size_t meta_data_size, - PageCacheMap & page_caches, - Logger * log) + const String & path, + PageFileId file_id, + UInt32 level, + const char * meta_data, + const size_t meta_data_size, + PageEntriesEdit & edit, + Logger * log) { const char * meta_data_end = meta_data + meta_data_size; @@ -367,12 +384,14 @@ std::pair analyzeMetaFile( // // recover WriteBatch while (pos < wb_start_pos + wb_bytes_without_checksum) { - auto is_put = get(pos); - if (is_put) + const auto is_put = get(pos); + const auto write_type = static_cast(is_put); + switch (write_type) + { + case WriteBatch::WriteType::PUT: { - auto page_id = get(pos); - PageCache pc; + PageEntry pc; pc.file_id = file_id; pc.level = level; pc.tag = get(pos); @@ -380,13 +399,22 @@ std::pair analyzeMetaFile( // pc.size = get(pos); pc.checksum = get(pos); - page_caches[page_id] = pc; + edit.put(page_id, pc); page_data_file_size += pc.size; + break; } - else + case WriteBatch::WriteType::DEL: { auto page_id = get(pos); - page_caches.erase(page_id); // Reserve the order of removal. + edit.del(page_id); // Reserve the order of removal. + break; + } + case WriteBatch::WriteType::REF: + { + const auto ref_id = get(pos); + const auto page_id = get(pos); + edit.ref(ref_id, page_id); + } } } // move `pos` over the checksum of WriteBatch @@ -423,13 +451,13 @@ PageFile::Writer::~Writer() syncFile(meta_file_fd, meta_file_path); } -void PageFile::Writer::write(const WriteBatch & wb, PageCacheMap & page_cache_map) +void PageFile::Writer::write(const WriteBatch & wb, PageEntriesEdit & edit) { ProfileEvents::increment(ProfileEvents::PSMWritePages, wb.putWriteCount()); // TODO: investigate if not copy data into heap, write big pages can be faster? 
ByteBuffer meta_buf, data_buf; - std::tie(meta_buf, data_buf) = PageMetaFormat::genWriteData(wb, page_file, page_cache_map); + std::tie(meta_buf, data_buf) = PageMetaFormat::genWriteData(wb, page_file, edit); SCOPE_EXIT({ page_file.free(meta_buf.begin(), meta_buf.size()); }); SCOPE_EXIT({ page_file.free(data_buf.begin(), data_buf.size()); }); @@ -458,12 +486,12 @@ PageFile::Reader::~Reader() ::close(data_file_fd); } -PageMap PageFile::Reader::read(PageIdAndCaches & to_read) +PageMap PageFile::Reader::read(PageIdAndEntries & to_read) { ProfileEvents::increment(ProfileEvents::PSMReadPages, to_read.size()); // Sort in ascending order by offset in file. - std::sort(to_read.begin(), to_read.end(), [](const PageIdAndCache & a, const PageIdAndCache & b) { + std::sort(to_read.begin(), to_read.end(), [](const PageIdAndEntry & a, const PageIdAndEntry & b) { return a.second.offset < b.second.offset; }); @@ -512,12 +540,12 @@ PageMap PageFile::Reader::read(PageIdAndCaches & to_read) return page_map; } -void PageFile::Reader::read(PageIdAndCaches & to_read, const PageHandler & handler) +void PageFile::Reader::read(PageIdAndEntries & to_read, const PageHandler & handler) { ProfileEvents::increment(ProfileEvents::PSMReadPages, to_read.size()); // Sort in ascending order by offset in file. 
- std::sort(to_read.begin(), to_read.end(), [](const PageIdAndCache & a, const PageIdAndCache & b) { + std::sort(to_read.begin(), to_read.end(), [](const PageIdAndEntry & a, const PageIdAndEntry & b) { return a.second.offset < b.second.offset; }); @@ -631,11 +659,11 @@ PageFile PageFile::openPageFileForRead(PageFileId file_id, UInt32 level, const s return PageFile(file_id, level, parent_path, false, false, log); } -void PageFile::readAndSetPageMetas(PageCacheMap & page_caches) +void PageFile::readAndSetPageMetas(PageEntriesEdit & edit) { - const auto path = metaPath(); - Poco::File file(path); - size_t file_size = file.getSize(); + const auto path = metaPath(); + Poco::File file(path); + const size_t file_size = file.getSize(); int file_fd = openFile(path); // File not exists. @@ -646,9 +674,9 @@ void PageFile::readAndSetPageMetas(PageCacheMap & page_caches) readFile(file_fd, 0, data, file_size, path); - // analyze meta file and update page_caches + // analyze meta file and update page_entries std::tie(this->meta_file_pos, this->data_file_pos) - = PageMetaFormat::analyzeMetaFile(folderPath(), file_id, level, data, file_size, page_caches, log); + = PageMetaFormat::analyzeMetaFile(folderPath(), file_id, level, data, file_size, edit, log); } void PageFile::setFormal() @@ -660,7 +688,7 @@ void PageFile::setFormal() file.renameTo(folderPath()); } -void PageFile::destroy() +void PageFile::destroy() const { // TODO: delay remove. 
Poco::File file(folderPath()); diff --git a/dbms/src/Storages/Page/PageFile.h b/dbms/src/Storages/Page/PageFile.h index 3329eec1b19..34d74386f6f 100644 --- a/dbms/src/Storages/Page/PageFile.h +++ b/dbms/src/Storages/Page/PageFile.h @@ -4,20 +4,18 @@ #include #include -#include - -#include - -#include -#include -#include #include -#include #include #include +#include #include +namespace Poco +{ +class Logger; +} // namespace Poco + namespace DB { @@ -39,14 +37,14 @@ class PageFile : public Allocator Writer(PageFile &, bool sync_on_write); ~Writer(); - void write(const WriteBatch & wb, PageCacheMap & page_cache_map); + void write(const WriteBatch & wb, PageEntriesEdit & edit); private: PageFile & page_file; bool sync_on_write; - std::string data_file_path; - std::string meta_file_path; + String data_file_path; + String meta_file_path; int data_file_fd; int meta_file_fd; @@ -63,13 +61,13 @@ class PageFile : public Allocator /// Read pages from files. /// After return, the items in to_read could be reordered, but won't be removed or added. - PageMap read(PageIdAndCaches & to_read); + PageMap read(PageIdAndEntries & to_read); - void read(PageIdAndCaches & to_read, const PageHandler & handler); + void read(PageIdAndEntries & to_read, const PageHandler & handler); private: - std::string data_file_path; - int data_file_fd; + String data_file_path; + int data_file_fd; }; struct Comparator @@ -84,20 +82,21 @@ class PageFile : public Allocator /// Create an empty page file. PageFile() = default; /// Recover a page file from disk. - static std::pair recover(const std::string & parent_path, const std::string & page_file_name, Logger * log); + static std::pair recover(const String & parent_path, const String & page_file_name, Poco::Logger * log); /// Create a new page file. 
- static PageFile newPageFile(PageFileId file_id, UInt32 level, const std::string & parent_path, bool is_tmp, Logger * log); + static PageFile newPageFile(PageFileId file_id, UInt32 level, const String & parent_path, bool is_tmp, Poco::Logger * log); /// Open an existing page file for read. - static PageFile openPageFileForRead(PageFileId file_id, UInt32 level, const std::string & parent_path, Logger * log); + static PageFile openPageFileForRead(PageFileId file_id, UInt32 level, const String & parent_path, Poco::Logger * log); /// Get pages' metadata by this method. Will also update file pos. /// Call this method after a page file recovered. - void readAndSetPageMetas(PageCacheMap & page_caches); + /// if check_page_map_complete is true, do del or ref on non-exist page will throw exception. + void readAndSetPageMetas(PageEntriesEdit & edit); /// Rename this page file into formal style. void setFormal(); /// Destroy underlying system files. - void destroy(); + void destroy() const; /// Return a writer bound with this PageFile object. /// Note that the user MUST keep the PageFile object around before this writer being freed. @@ -115,26 +114,26 @@ class PageFile : public Allocator private: /// Create a new page file. - PageFile(PageFileId file_id_, UInt32 level_, const std::string & parent_path, bool is_tmp_, bool is_create, Logger * log); + PageFile(PageFileId file_id_, UInt32 level_, const String & parent_path, bool is_tmp_, bool is_create, Poco::Logger * log); - std::string folderPath() const + String folderPath() const { return parent_path + "/" + (is_tmp ? ".tmp.page_" : "page_") + DB::toString(file_id) + "_" + DB::toString(level); } - std::string dataPath() const { return folderPath() + "/page"; } - std::string metaPath() const { return folderPath() + "/meta"; } + String dataPath() const { return folderPath() + "/page"; } + String metaPath() const { return folderPath() + "/meta"; } private: - UInt64 file_id = 0; // Valid id start from 1. 
- UInt32 level = 0; // 0: normal, >= 1: generated by GC. - bool is_tmp = false; // true if currently writen by GC thread. - std::string parent_path{}; // The parent folder of this page file. + UInt64 file_id = 0; // Valid id start from 1. + UInt32 level = 0; // 0: normal, >= 1: generated by GC. + bool is_tmp = false; // true if currently writen by GC thread. + String parent_path{}; // The parent folder of this page file. // The append pos. UInt64 data_file_pos = 0; UInt64 meta_file_pos = 0; - Logger * log = nullptr; + Poco::Logger * log = nullptr; }; } // namespace DB diff --git a/dbms/src/Storages/Page/PageStorage.cpp b/dbms/src/Storages/Page/PageStorage.cpp index b85ab3da914..2a75c570cc0 100644 --- a/dbms/src/Storages/Page/PageStorage.cpp +++ b/dbms/src/Storages/Page/PageStorage.cpp @@ -1,7 +1,11 @@ +#include + #include #include -#include +#include +#include +#include namespace DB { @@ -12,17 +16,21 @@ extern const int LOGICAL_ERROR; } // namespace ErrorCodes std::set -PageStorage::listAllPageFiles(const std::string & storage_path, bool remove_tmp_file, Logger * page_file_log) +PageStorage::listAllPageFiles(const String & storage_path, bool remove_tmp_file, Logger * page_file_log) { // collect all pages from `storage_path` and recover to `PageFile` objects Poco::File folder(storage_path); if (!folder.exists()) + { folder.createDirectories(); + } std::vector file_names; folder.list(file_names); if (file_names.empty()) + { return {}; + } std::set page_files; for (const auto & name : file_names) @@ -43,42 +51,79 @@ PageStorage::listAllPageFiles(const std::string & storage_path, bool remove_tmp_ return page_files; } -PageStorage::PageStorage(const std::string & storage_path_, const Config & config_) - : storage_path(storage_path_), config(config_), page_file_log(&Logger::get("PageFile")), log(&Logger::get("PageStorage")) +PageStorage::PageStorage(const String & storage_path_, const Config & config_) + : storage_path(storage_path_), + config(config_), + version_set(), 
+ page_file_log(&Poco::Logger::get("PageFile")), + log(&Poco::Logger::get("PageStorage")) { /// page_files are in ascending ordered by (file_id, level). auto page_files = PageStorage::listAllPageFiles(storage_path, /* remove_tmp_file= */ true, page_file_log); + // recover current version from files + +#ifdef DELTA_VERSION_SET for (auto & page_file : page_files) { - const_cast(page_file).readAndSetPageMetas(page_cache_map); + PageEntriesEdit edit; + const_cast(page_file).readAndSetPageMetas(edit); // Only level 0 is writable. if (page_file.getLevel() == 0) + { write_file = page_file; + } + // apply edit to new version + version_set.apply(edit); } +#else + auto snapshot = version_set.getSnapshot(); - for (const auto & p : page_cache_map) + typename PageEntryMapVersionSet::BuilderType builder( + snapshot->version(), true, log); // If there are invalid ref-pairs, just ignore that + for (auto & page_file : page_files) { - max_page_id = std::max(max_page_id, p.first); + PageEntriesEdit edit; + const_cast(page_file).readAndSetPageMetas(edit); + + // Only level 0 is writable. 
+ if (page_file.getLevel() == 0) + { + write_file = page_file; + } + // apply edit to new version + builder.apply(edit); } + version_set.restore(builder.build()); +#endif } PageId PageStorage::getMaxId() { std::lock_guard write_lock(write_mutex); - - return max_page_id; + return version_set.getSnapshot()->version()->maxId(); } -PageCache PageStorage::getCache(PageId page_id) +PageEntry PageStorage::getEntry(PageId page_id, SnapshotPtr snapshot) { - std::shared_lock lock(read_mutex); + if (snapshot == nullptr) + { + snapshot = this->getSnapshot(); + } - auto it = page_cache_map.find(page_id); - if (it != page_cache_map.end()) - return it->second; - else - return {}; + try + { // this may throw an exception if ref to non-exist page + auto entry = snapshot->version()->find(page_id); + if (entry != nullptr) + return *entry; // A copy of PageEntry + else + return {}; // return invalid PageEntry + } + catch (DB::Exception & e) + { + LOG_WARNING(log, e.message()); + return {}; // return invalid PageEntry + } } PageFile::Writer & PageStorage::getWriter() @@ -103,7 +148,7 @@ PageStorage::ReaderPtr PageStorage::getReader(const PageFileIdAndLevel & file_id std::lock_guard lock(open_read_files_mutex); auto & cached_reader = open_read_files[file_id_level]; - if (!cached_reader) + if (cached_reader == nullptr) { auto page_file = PageFile::openPageFileForRead(file_id_level.first, file_id_level.second, storage_path, page_file_log); cached_reader = page_file.createReader(); @@ -113,55 +158,55 @@ PageStorage::ReaderPtr PageStorage::getReader(const PageFileIdAndLevel & file_id void PageStorage::write(const WriteBatch & wb) { - PageCacheMap caches; - { - std::lock_guard lock(write_mutex); - getWriter().write(wb, caches); - - { - std::unique_lock read_lock(read_mutex); + PageEntriesEdit edit; + std::lock_guard lock(write_mutex); + getWriter().write(wb, edit); + + // Apply changes into version_set(generate a new version) + // If there are RefPages to non-exist Pages, just put the ref 
pair to new version + // instead of throwing exception. Or we can't open PageStorage since we have already + // persist the invalid ref pair into PageFile. + version_set.apply(edit); +} - for (const auto & w : wb.getWrites()) - { - max_page_id = std::max(max_page_id, w.page_id); - if (w.is_put) - page_cache_map[w.page_id] = caches[w.page_id]; - else - page_cache_map.erase(w.page_id); - } - } - } +PageStorage::SnapshotPtr PageStorage::getSnapshot() +{ + return version_set.getSnapshot(); } -Page PageStorage::read(PageId page_id) +Page PageStorage::read(PageId page_id, SnapshotPtr snapshot) { - std::shared_lock lock(read_mutex); + if (snapshot == nullptr) + { + snapshot = this->getSnapshot(); + } - auto it = page_cache_map.find(page_id); - if (it == page_cache_map.end()) + auto page_entry = snapshot->version()->find(page_id); + if (page_entry == nullptr) throw Exception("Page " + DB::toString(page_id) + " not found", ErrorCodes::LOGICAL_ERROR); - const auto & page_cache = it->second; - auto file_id_level = page_cache.fileIdLevel(); - PageIdAndCaches to_read = {{page_id, page_cache}}; - auto file_reader = getReader(file_id_level); + const auto file_id_level = page_entry->fileIdLevel(); + PageIdAndEntries to_read = {{page_id, *page_entry}}; + auto file_reader = getReader(file_id_level); return file_reader->read(to_read)[page_id]; } -PageMap PageStorage::read(const std::vector & page_ids) +PageMap PageStorage::read(const std::vector & page_ids, SnapshotPtr snapshot) { - std::shared_lock lock(read_mutex); + if (snapshot == nullptr) + { + snapshot = this->getSnapshot(); + } - std::map> file_read_infos; + std::map> file_read_infos; for (auto page_id : page_ids) { - auto it = page_cache_map.find(page_id); - if (it == page_cache_map.end()) + auto page_entry = snapshot->version()->find(page_id); + if (page_entry == nullptr) throw Exception("Page " + DB::toString(page_id) + " not found", ErrorCodes::LOGICAL_ERROR); - const auto & page_cache = it->second; - auto file_id_level = 
page_cache.fileIdLevel(); + auto file_id_level = page_entry->fileIdLevel(); auto & [page_id_and_caches, file_reader] = file_read_infos[file_id_level]; - page_id_and_caches.emplace_back(page_id, page_cache); - if (!file_reader) + page_id_and_caches.emplace_back(page_id, *page_entry); + if (file_reader == nullptr) file_reader = getReader(file_id_level); } @@ -178,21 +223,23 @@ PageMap PageStorage::read(const std::vector & page_ids) return page_map; } -void PageStorage::read(const std::vector & page_ids, PageHandler & handler) +void PageStorage::read(const std::vector & page_ids, PageHandler & handler, SnapshotPtr snapshot) { - std::shared_lock lock(read_mutex); + if (snapshot == nullptr) + { + snapshot = this->getSnapshot(); + } - std::map> file_read_infos; + std::map> file_read_infos; for (auto page_id : page_ids) { - auto it = page_cache_map.find(page_id); - if (it == page_cache_map.end()) + auto page_entry = snapshot->version()->find(page_id); + if (page_entry == nullptr) throw Exception("Page " + DB::toString(page_id) + " not found", ErrorCodes::LOGICAL_ERROR); - const auto & page_cache = it->second; - auto file_id_level = page_cache.fileIdLevel(); + auto file_id_level = page_entry->fileIdLevel(); auto & [page_id_and_caches, file_reader] = file_read_infos[file_id_level]; - page_id_and_caches.emplace_back(page_id, page_cache); - if (!file_reader) + page_id_and_caches.emplace_back(page_id, *page_entry); + if (file_reader == nullptr) file_reader = getReader(file_id_level); } @@ -206,30 +253,74 @@ void PageStorage::read(const std::vector & page_ids, PageHandler & handl } } -void PageStorage::traverse(std::function acceptor) +void PageStorage::traverse(const std::function & acceptor, SnapshotPtr snapshot) { - std::shared_lock lock(read_mutex); + if (snapshot == nullptr) + { + snapshot = this->getSnapshot(); + } std::map file_and_pages; +#ifdef DELTA_VERSION_SET { - for (const auto & [page_id, page_cache] : page_cache_map) - 
file_and_pages[page_cache.fileIdLevel()].emplace_back(page_id); + auto valid_pages_ids = snapshot->version()->validPageIds(); + for (auto page_id : valid_pages_ids) + { + auto page_entry = snapshot->version()->find(page_id); + if (unlikely(page_entry == nullptr)) + throw Exception("Page[" + DB::toString(page_id) + "] not found when traversing PageStorage", ErrorCodes::LOGICAL_ERROR); + file_and_pages[page_entry->fileIdLevel()].emplace_back(page_id); + } + } +#else + { + for (auto iter = snapshot->version()->cbegin(); iter != snapshot->version()->cend(); ++iter) + { + const PageId page_id = iter.pageId(); + const PageEntry & page_entry = iter.pageEntry(); // this may throw an exception if ref to non-exist page + file_and_pages[page_entry.fileIdLevel()].emplace_back(page_id); + } } +#endif for (const auto & p : file_and_pages) { - auto pages = read(p.second); + auto pages = read(p.second, snapshot); for (const auto & id_page : pages) + { acceptor(id_page.second); + } } } -void PageStorage::traversePageCache(std::function acceptor) +void PageStorage::traversePageEntries( // + const std::function & acceptor, + SnapshotPtr snapshot) { - std::shared_lock lock(read_mutex); + if (snapshot == nullptr) + { + snapshot = this->getSnapshot(); + } - for (const auto & [page_id, page_cache] : page_cache_map) - acceptor(page_id, page_cache); + // traverse over all Pages or RefPages +#ifdef DELTA_VERSION_SET + auto valid_pages_ids = snapshot->version()->validPageIds(); + for (auto page_id : valid_pages_ids) + { + auto page_entry = snapshot->version()->find(page_id); + if (unlikely(page_entry == nullptr)) + throw Exception("Page[" + DB::toString(page_id) + "] not found when traversing PageStorage's entries", + ErrorCodes::LOGICAL_ERROR); + acceptor(page_id, *page_entry); + } +#else + for (auto iter = snapshot->version()->cbegin(); iter != snapshot->version()->cend(); ++iter) + { + const PageId page_id = iter.pageId(); + const PageEntry & page_entry = iter.pageEntry(); // this may 
throw an exception if ref to non-exist page + acceptor(page_id, page_entry); + } +#endif } @@ -237,9 +328,11 @@ bool PageStorage::gc() { std::lock_guard gc_lock(gc_mutex); // get all PageFiles - auto page_files = PageStorage::listAllPageFiles(storage_path, true, page_file_log); + const auto page_files = PageStorage::listAllPageFiles(storage_path, true, page_file_log); if (page_files.empty()) + { return false; + } LOG_DEBUG(log, "PageStorage GC start"); @@ -250,25 +343,40 @@ bool PageStorage::gc() } std::set merge_files; - PageCacheMap gc_file_page_cache_map; + PageEntriesEdit gc_file_entries_edit; { - /// Select the GC candidates and write them into an new file. - /// Since we don't update any shared information, only a read lock is sufficient. - - std::shared_lock lock(read_mutex); + /// Select the GC candidates files and migrate valid pages into an new file. + /// Acquire a snapshot version of page map, new edit on page map store in `gc_file_entries_edit` + SnapshotPtr snapshot = this->getSnapshot(); std::map> file_valid_pages; { - for (const auto & [page_id, page_cache] : page_cache_map) + // Only scan over normal Pages, excluding RefPages +#ifdef DELTA_VERSION_SET + auto valid_normal_page_ids = snapshot->version()->validNormalPageIds(); + for (auto page_id : valid_normal_page_ids) + { + auto page_entry = snapshot->version()->find(page_id); + if (unlikely(page_entry == nullptr)) + { + throw Exception("PageStorage GC: Normal Page " + DB::toString(page_id) + " not found.", ErrorCodes::LOGICAL_ERROR); + } + auto && [valid_size, valid_page_ids_in_file] = file_valid_pages[page_entry->fileIdLevel()]; + valid_size += page_entry->size; +#else + for (auto iter = snapshot->version()->pages_cbegin(); iter != snapshot->version()->pages_cend(); ++iter) { - auto && [valid_size, valid_page_ids_in_file] = file_valid_pages[page_cache.fileIdLevel()]; - valid_size += page_cache.size; - valid_page_ids_in_file.push_back(page_id); + const PageId page_id = iter->first; + const 
PageEntry & page_entry = iter->second; + auto && [valid_size, valid_page_ids_in_file] = file_valid_pages[page_entry.fileIdLevel()]; + valid_size += page_entry.size; +#endif + valid_page_ids_in_file.emplace_back(page_id); } } - // select gc candidate files into `merge_files` + // Select gc candidate files into `merge_files` UInt64 candidate_total_size = 0; size_t migrate_page_count = 0; merge_files = gcSelectCandidateFiles(page_files, file_valid_pages, writing_file_id_level, candidate_total_size, migrate_page_count); @@ -282,35 +390,48 @@ bool PageStorage::gc() return false; } - LOG_DEBUG(log, "GC decide to merge " << merge_files.size() << " files, containing " << migrate_page_count << " regions"); + LOG_INFO(log, "GC decide to merge " << merge_files.size() << " files, containing " << migrate_page_count << " regions"); - // if there are no valid pages to be migrated, then jump over - if (migrate_page_count > 0) - { - gc_file_page_cache_map = gcMigratePages(file_valid_pages, merge_files); - } + // There are no valid pages to be migrated but valid ref pages, scan over all `merge_files` and do migrate. + gc_file_entries_edit = gcMigratePages(snapshot, file_valid_pages, merge_files); } - { - /// Here we have to update the cache information which readers need to synchronize, a write lock is needed. - std::unique_lock lock(read_mutex); - gcUpdatePageMap(gc_file_page_cache_map); + std::set live_files; + /// Here we have to apply edit to version_set and generate a new version, then return all files that are in used + live_files = version_set.gcApply(gc_file_entries_edit); - // TODO: potential bug: A read thread may just select a file F, while F is being GCed. And after GC, we remove F from - // reader cache. But after that, A could come in and re-add F reader cache. It is not a very big issue, because - // it only cause a hanging opened fd, which no one will use anymore. - // Remove reader cache. 
- for (const auto & [file_id, level] : merge_files) + { + // Remove obsolete files' reader cache that are not used by any version + std::lock_guard lock(open_read_files_mutex); + for (const auto & page_file : page_files) { - open_read_files.erase({file_id, level}); + const auto page_id_and_lvl = page_file.fileIdLevel(); + if (page_id_and_lvl >= writing_file_id_level) + { + continue; + } + + if (live_files.count(page_id_and_lvl) == 0) + { + open_read_files.erase(page_id_and_lvl); + } } } - // destroy the files have already been gc - for (const auto & [file_id, level] : merge_files) + // Delete obsolete files that are not used by any version, without lock + for (const auto & page_file : page_files) { - auto page_file = PageFile::openPageFileForRead(file_id, level, storage_path, page_file_log); - page_file.destroy(); + const auto page_id_and_lvl = page_file.fileIdLevel(); + if (page_id_and_lvl >= writing_file_id_level) + { + continue; + } + + if (live_files.count(page_id_and_lvl) == 0) + { + // the page file is not used by any version, remove reader cache + page_file.destroy(); + } } return true; } @@ -325,19 +446,13 @@ PageStorage::GcCandidates PageStorage::gcSelectCandidateFiles( // keep readable GcCandidates merge_files; for (auto & page_file : page_files) { - auto file_size = page_file.getDataFileSize(); - UInt64 valid_size; - float valid_rate; - size_t valid_page_count; + const auto file_size = page_file.getDataFileSize(); + UInt64 valid_size = 0; + float valid_rate = 0.0f; + size_t valid_page_count = 0; auto it = file_valid_pages.find(page_file.fileIdLevel()); - if (it == file_valid_pages.end()) - { - valid_size = 0; - valid_rate = 0; - valid_page_count = 0; - } - else + if (it != file_valid_pages.end()) { valid_size = it->second.first; valid_rate = (float)valid_size / file_size; @@ -348,66 +463,82 @@ PageStorage::GcCandidates PageStorage::gcSelectCandidateFiles( // keep readable bool is_candidate = (page_file.fileIdLevel() != writing_file_id_level) && 
(valid_rate < config.merge_hint_low_used_rate || file_size < config.file_small_size); if (!is_candidate) + { continue; + } merge_files.emplace(page_file.fileIdLevel()); - migrate_page_count += valid_page_count; candidate_total_size += valid_size; if (candidate_total_size >= config.file_max_size) + { break; + } } return merge_files; } -PageCacheMap PageStorage::gcMigratePages(const GcLivesPages & file_valid_pages, const GcCandidates & merge_files) const +PageEntriesEdit +PageStorage::gcMigratePages(const SnapshotPtr & snapshot, const GcLivesPages & file_valid_pages, const GcCandidates & merge_files) const { - PageCacheMap gc_file_page_cache_map; + PageEntriesEdit gc_file_edit; + // merge `merge_files` to PageFile which PageId = max of all `merge_files` and level = level + 1 auto [largest_file_id, level] = *(merge_files.rbegin()); PageFile gc_file = PageFile::newPageFile(largest_file_id, level + 1, storage_path, /* is_tmp= */ true, page_file_log); size_t num_successful_migrate_pages = 0; + size_t num_valid_ref_pages = 0; + auto * current = snapshot->version(); { + PageEntriesEdit legacy_edit; // All page entries in `merge_files` // No need to sync after each write. Do sync before closing is enough. auto gc_file_writer = gc_file.createWriter(/* sync_on_write= */ false); for (const auto & file_id_level : merge_files) { + PageFile to_merge_file = PageFile::openPageFileForRead(file_id_level.first, file_id_level.second, storage_path, page_file_log); + // Note: This file may not contain any valid page, but valid RefPages which we need to migrate + to_merge_file.readAndSetPageMetas(legacy_edit); + auto it = file_valid_pages.find(file_id_level); if (it == file_valid_pages.end()) { // This file does not contain any valid page. 
continue; } - const auto & page_ids = it->second.second; - PageFile to_merge_file = PageFile::openPageFileForRead(file_id_level.first, file_id_level.second, storage_path, page_file_log); - auto to_merge_file_reader = to_merge_file.createReader(); - - PageIdAndCaches page_id_and_caches; + auto to_merge_file_reader = to_merge_file.createReader(); + PageIdAndEntries page_id_and_entries; { + const auto & page_ids = it->second.second; for (auto page_id : page_ids) { - auto it2 = page_cache_map.find(page_id); - // This page is already removed. - if (it2 == page_cache_map.end()) - continue; - const auto & page_cache = it2->second; - // This page is covered by newer file. - if (page_cache.fileIdLevel() != file_id_level) - continue; - page_id_and_caches.emplace_back(page_id, page_cache); - num_successful_migrate_pages += 1; + try + { + auto page_entry = current->find(page_id); + if (page_entry == nullptr) + continue; + // This page is covered by newer file. + if (page_entry->fileIdLevel() != file_id_level) + continue; + page_id_and_entries.emplace_back(page_id, *page_entry); + num_successful_migrate_pages += 1; + } + catch (DB::Exception & e) + { + // ignore if it2 is a ref to non-exist page + LOG_WARNING(log, "Ignore invalid RefPage while gcMigratePages: " + e.message()); + } } } - if (!page_id_and_caches.empty()) + if (!page_id_and_entries.empty()) { // copy valid pages from `to_merge_file` to `gc_file` - PageMap pages = to_merge_file_reader->read(page_id_and_caches); + PageMap pages = to_merge_file_reader->read(page_id_and_entries); WriteBatch wb; - for (const auto & [page_id, page_cache] : page_id_and_caches) + for (const auto & [page_id, page_cache] : page_id_and_entries) { auto & page = pages.find(page_id)->second; wb.putPage(page_id, @@ -416,43 +547,41 @@ PageCacheMap PageStorage::gcMigratePages(const GcLivesPages & file_valid_pages, page.data.size()); } - gc_file_writer->write(wb, gc_file_page_cache_map); + gc_file_writer->write(wb, gc_file_edit); } } - } - if 
(gc_file_page_cache_map.empty()) + { + // Migrate RefPages which are still valid. + WriteBatch batch; + for (const auto & rec : legacy_edit.getRecords()) + { + // Get `normal_page_id` from memory's `page_entry_map`. Note: can not get `normal_page_id` from disk, + // if it is a record of RefPage to another RefPage, the later ref-id is resolve to the actual `normal_page_id`. + auto [is_ref, normal_page_id] = current->isRefId(rec.page_id); + if (is_ref) + { + batch.putRefPage(rec.page_id, normal_page_id); + num_valid_ref_pages += 1; + } + } + gc_file_writer->write(batch, gc_file_edit); + } + } // free gc_file_writer and sync + + if (gc_file_edit.empty() && num_valid_ref_pages == 0) { gc_file.destroy(); } else { gc_file.setFormal(); - auto id = gc_file.fileIdLevel(); - LOG_DEBUG(log, "GC have migrated " << num_successful_migrate_pages << " regions to PageFile_" << id.first << "_" << id.second); - } - return gc_file_page_cache_map; -} - -void PageStorage::gcUpdatePageMap(const PageCacheMap & gc_pages_map) -{ - for (const auto & [page_id, page_cache] : gc_pages_map) - { - auto it = page_cache_map.find(page_id); - // if the gc page have already been remove, just ignore it - if (it == page_cache_map.end()) - { - continue; - } - auto & old_page_cache = it->second; - // In case of page being updated during GC process. - if (old_page_cache.fileIdLevel() < page_cache.fileIdLevel()) - { - // no new page write to `page_cache_map`, replace it with gc page - old_page_cache = page_cache; - } - // else new page written by another thread, gc page is replaced. 
leave the page for next gc + const auto id = gc_file.fileIdLevel(); + LOG_INFO(log, + "GC have migrated " << num_successful_migrate_pages << " regions and " << num_valid_ref_pages << " RefPages to PageFile_" + << id.first << "_" << id.second); } + return gc_file_edit; } -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Storages/Page/PageStorage.h b/dbms/src/Storages/Page/PageStorage.h index 27141b9d54c..cb74dd2b542 100644 --- a/dbms/src/Storages/Page/PageStorage.h +++ b/dbms/src/Storages/Page/PageStorage.h @@ -6,11 +6,18 @@ #include #include +#include +#include #include +#include +#include +#include namespace DB { +#define DELTA_VERSION_SET + /** * A storage system stored pages. Pages are serialized objects referenced by PageId. Store Page with the same PageId * will covered the old ones. The file used to persist the Pages called PageFile. The meta data of a Page, like the @@ -42,21 +49,29 @@ class PageStorage using OpenReadFiles = std::map; public: - PageStorage(const std::string & storage_path, const Config & config_); + PageStorage(const String & storage_path, const Config & config_); + + PageId getMaxId(); + + void write(const WriteBatch & write_batch); - PageId getMaxId(); - PageCache getCache(PageId page_id); +#ifdef DELTA_VERSION_SET + using SnapshotPtr = PageEntriesVersionSetWithDelta::SnapshotPtr; +#else + using SnapshotPtr = PageEntryMapVersionSet::SnapshotPtr; +#endif + SnapshotPtr getSnapshot(); - void write(const WriteBatch & write_batch); - Page read(PageId page_id); - PageMap read(const std::vector & page_ids); - void read(const std::vector & page_ids, PageHandler & handler); - void traverse(std::function acceptor); - void traversePageCache(std::function acceptor); - bool gc(); + PageEntry getEntry(PageId page_id, SnapshotPtr snapshot = nullptr); + Page read(PageId page_id, SnapshotPtr snapshot = nullptr); + PageMap read(const std::vector & page_ids, SnapshotPtr snapshot = nullptr); + void read(const std::vector 
& page_ids, PageHandler & handler, SnapshotPtr snapshot = nullptr); + void traverse(const std::function & acceptor, SnapshotPtr snapshot = nullptr); + void traversePageEntries(const std::function & acceptor, SnapshotPtr snapshot); + bool gc(); static std::set - listAllPageFiles(const std::string & storage_path, bool remove_tmp_file, Logger * page_file_log); + listAllPageFiles(const String & storage_path, bool remove_tmp_file, Poco::Logger * page_file_log); private: PageFile::Writer & getWriter(); @@ -69,15 +84,18 @@ class PageStorage const PageFileIdAndLevel & writing_file_id_level, UInt64 & candidate_total_size, size_t & migrate_page_count) const; - PageCacheMap gcMigratePages(const GcLivesPages & file_valid_pages, const GcCandidates & merge_files) const; - void gcUpdatePageMap(const PageCacheMap & gc_pages_map); + PageEntriesEdit + gcMigratePages(const SnapshotPtr & snapshot, const GcLivesPages & file_valid_pages, const GcCandidates & merge_files) const; private: - std::string storage_path; - Config config; + String storage_path; + Config config; - PageCacheMap page_cache_map; - PageId max_page_id = 0; +#ifdef DELTA_VERSION_SET + PageEntriesVersionSetWithDelta version_set; +#else + PageEntryMapVersionSet version_set; +#endif PageFile write_file; WriterPtr write_file_writer; @@ -85,12 +103,11 @@ class PageStorage OpenReadFiles open_read_files; std::mutex open_read_files_mutex; // A mutex only used to protect open_read_files. 
- Logger * page_file_log; - Logger * log; + Poco::Logger * page_file_log; + Poco::Logger * log; - std::mutex write_mutex; - std::shared_mutex read_mutex; - std::mutex gc_mutex; // A mutex used to protect only gc + std::mutex write_mutex; + std::mutex gc_mutex; // A mutex used to protect gc }; } // namespace DB diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesBuilder.cpp b/dbms/src/Storages/Page/VersionSet/PageEntriesBuilder.cpp new file mode 100644 index 00000000000..27369889f22 --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesBuilder.cpp @@ -0,0 +1,39 @@ +#include + +namespace DB +{ + +void PageEntriesBuilder::apply(const PageEntriesEdit & edit) +{ + for (const auto & rec : edit.getRecords()) + { + switch (rec.type) + { + case WriteBatch::WriteType::PUT: + current_version->put(rec.page_id, rec.entry); + break; + case WriteBatch::WriteType::DEL: + current_version->del(rec.page_id); + break; + case WriteBatch::WriteType::REF: + if (likely(!ignore_invalid_ref)) + { + current_version->ref(rec.page_id, rec.ori_page_id); + } + else + { + try + { + current_version->ref(rec.page_id, rec.ori_page_id); + } + catch (DB::Exception & e) + { + LOG_WARNING(log, "Ignore invalid RefPage while opening PageStorage: " + e.message()); + } + } + break; + } + } +} + +} // namespace DB diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesBuilder.h b/dbms/src/Storages/Page/VersionSet/PageEntriesBuilder.h new file mode 100644 index 00000000000..b2eb76ec8c4 --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesBuilder.h @@ -0,0 +1,70 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class PageEntriesBuilder +{ +public: + explicit PageEntriesBuilder(const PageEntries * old_version_, // + bool ignore_invalid_ref_ = false, + Poco::Logger * log_ = nullptr) + : old_version(const_cast(old_version_)), + current_version(new PageEntries), // + ignore_invalid_ref(ignore_invalid_ref_), + log(log_) + { +#ifndef NDEBUG + if 
(ignore_invalid_ref) + { + assert(log != nullptr); + } +#endif + old_version->incrRefCount(); + current_version->copyEntries(*old_version); + } + + ~PageEntriesBuilder() { old_version->decrRefCount(); } + + void apply(const PageEntriesEdit & edit); + + void gcApply(PageEntriesEdit & edit) { gcApplyTemplate(current_version, edit, current_version); } + + PageEntries * build() { return current_version; } + +public: + template + static void gcApplyTemplate(const OldVersionType & old_version, PageEntriesEdit & edit, VersionType & new_version) + { + for (auto & rec : edit.getRecords()) + { + if (rec.type != WriteBatch::WriteType::PUT) + continue; + // Gc only apply PUT for updating page entries + auto old_page_entry = old_version->find(rec.page_id); + // If the gc page have already been removed, or is a ref to non-exist page, just ignore it + if (old_page_entry == nullptr) + continue; + // In case of page being updated during GC process. + if (old_page_entry->fileIdLevel() < rec.entry.fileIdLevel()) + { + // no new page write to `page_entry_map`, replace it with gc page + rec.entry.ref = old_page_entry->ref; + new_version->normal_pages[rec.page_id] = rec.entry; + } + // else new page written by another thread, gc page is replaced. leave the page for next gc + } + } + +private: + PageEntries * old_version; + PageEntries * current_version; + bool ignore_invalid_ref; + Poco::Logger * log; +}; + +} // namespace DB diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesEdit.h b/dbms/src/Storages/Page/VersionSet/PageEntriesEdit.h new file mode 100644 index 00000000000..dcc53b7ea04 --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesEdit.h @@ -0,0 +1,78 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +/// Page entries change to apply to version set. 
+class PageEntriesEdit +{ +public: + PageEntriesEdit() = default; + + void put(PageId page_id, const PageEntry & entry) + { + EditRecord record; + record.type = WriteBatch::WriteType::PUT; + record.page_id = page_id; + record.entry = entry; + records.emplace_back(record); + } + + void del(PageId page_id) + { + EditRecord record; + record.type = WriteBatch::WriteType::DEL; + record.page_id = page_id; + records.emplace_back(record); + } + + void ref(PageId ref_id, PageId page_id) + { + EditRecord record; + record.type = WriteBatch::WriteType::REF; + record.page_id = ref_id; + record.ori_page_id = page_id; + records.emplace_back(record); + } + + bool empty() const { return records.empty(); } + + size_t size() const { return records.size(); } + + struct EditRecord + { + WriteBatch::WriteType type; + char _padding[7]; // 7 bytes unused since type is only 1 byte. + PageId page_id; + PageId ori_page_id; + PageEntry entry; + }; + using EditRecords = std::vector; + + EditRecords & getRecords() { return records; } + const EditRecords & getRecords() const { return records; } + +private: + EditRecords records; + +public: + // No copying allowed + PageEntriesEdit(const PageEntriesEdit &) = delete; + PageEntriesEdit & operator=(const PageEntriesEdit &) = delete; + // Only move allowed + PageEntriesEdit(PageEntriesEdit && rhs) noexcept : PageEntriesEdit() { *this = std::move(rhs); } + PageEntriesEdit & operator=(PageEntriesEdit && rhs) noexcept + { + if (this != &rhs) + { + records.swap(rhs.records); + } + return *this; + } +}; + +} // namespace DB diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSet.cpp b/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSet.cpp new file mode 100644 index 00000000000..cd052eafffb --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSet.cpp @@ -0,0 +1,37 @@ +#include + +namespace DB +{ + +std::set PageEntriesVersionSet::gcApply(PageEntriesEdit & edit) +{ + std::unique_lock lock(read_mutex); + + // apply edit 
on base + PageEntries * v = nullptr; + { + PageEntriesBuilder builder(current); + builder.gcApply(edit); + v = builder.build(); + } + + this->appendVersion(v); + + return listAllLiveFiles(); +} + +std::set PageEntriesVersionSet::listAllLiveFiles() const +{ + std::set liveFiles; + for (PageEntries * v = placeholder_node.next; v != &placeholder_node; v = v->next) + { + for (auto it = v->pages_cbegin(); it != v->pages_cend(); ++it) + { + liveFiles.insert(it->second.fileIdLevel()); + } + } + return liveFiles; +} + + +} // namespace DB diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSet.h b/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSet.h new file mode 100644 index 00000000000..ebf2176ca31 --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSet.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class PageEntriesVersionSet : public ::DB::MVCC::VersionSet +{ +public: + explicit PageEntriesVersionSet(const ::DB::MVCC::VersionSetConfig & config_ = ::DB::MVCC::VersionSetConfig()) + : ::DB::MVCC::VersionSet(config_) + { + } + +public: + using SnapshotPtr = ::DB::MVCC::VersionSet::SnapshotPtr; + + /// `gcApply` only accept PageEntry's `PUT` changes and will discard changes if PageEntry is invalid + /// append new version to version-list + std::set gcApply(PageEntriesEdit & edit); + + /// List all PageFile that are used by any version + std::set listAllLiveFiles() const; +}; + + +} // namespace DB diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSetWithDelta.cpp b/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSetWithDelta.cpp new file mode 100644 index 00000000000..7ae553bdcff --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSetWithDelta.cpp @@ -0,0 +1,294 @@ +#include + +#include + +#include + +namespace DB +{ + 
+//========================================================================================== +// PageEntriesVersionSetWithDelta +//========================================================================================== + +std::set PageEntriesVersionSetWithDelta::gcApply(PageEntriesEdit & edit) +{ + std::unique_lock lock(read_mutex); + + if (current.use_count() == 1 && current->isBase()) + { + // If no readers, we could directly merge edits + EditAcceptor::gcApplyInplace(current, edit); + } + else + { + if (current.use_count() != 1) + { + VersionPtr v = VersionType::createDelta(); + appendVersion(std::move(v)); + } + auto view = std::make_shared(current); + EditAcceptor builder(view.get()); + builder.gcApply(edit); + } + + return listAllLiveFiles(); +} + +std::set PageEntriesVersionSetWithDelta::listAllLiveFiles() const +{ + // Note read_mutex must be hold. + std::set liveFiles; + std::set visitedVersions; // avoid to access same version multiple time + // Iterate all snapshot to collect all PageFile in used. + for (auto s = snapshots->next; s != snapshots.get(); s = s->next) + { + collectLiveFilesFromVersionList(s->version()->getSharedTailVersion(), visitedVersions, liveFiles); + } + // Iterate over `current` + collectLiveFilesFromVersionList(current, visitedVersions, liveFiles); + return liveFiles; +} + +void PageEntriesVersionSetWithDelta::collectLiveFilesFromVersionList( // + VersionPtr v, + std::set & visited, + std::set & liveFiles) const +{ + for (; v != nullptr; v = v->prev) + { + // If this version has been visited, all previous version has been collected. 
+ if (visited.count(v) > 0) + break; + for (auto it = v->pages_cbegin(); it != v->pages_cend(); ++it) + { + // ignore if it is a tombstone entry + if (it->second.ref != 0) + { + liveFiles.insert(it->second.fileIdLevel()); + } + } + visited.insert(v); + } +} + +//========================================================================================== +// Functions used when view release and do compact on version-list +//========================================================================================== + +PageEntriesVersionSetWithDelta::VersionPtr // +PageEntriesVersionSetWithDelta::compactDeltaAndBase( // + const PageEntriesVersionSetWithDelta::VersionPtr & old_base, + PageEntriesVersionSetWithDelta::VersionPtr & delta) const +{ + PageEntriesVersionSetWithDelta::VersionPtr base = PageEntriesForDelta::createBase(); + base->copyEntries(*old_base); + // apply delta edits + delta->prev = base; + base->merge(*delta); + delta->clear(); + return base; +} + +PageEntriesVersionSetWithDelta::VersionPtr // +PageEntriesVersionSetWithDelta::compactDeltas( // + const PageEntriesVersionSetWithDelta::VersionPtr & tail) const +{ + if (tail->prev == nullptr || tail->prev->isBase()) + { + // Only one delta, do nothing + return nullptr; + } + + auto tmp = PageEntriesVersionSetWithDelta::VersionType::createDelta(); + + std::stack nodes; + for (auto node = tail; node != nullptr; node = node->prev) + { + if (node->isBase()) + { + // link `tmp` to `base` version + tmp->prev = node; + } + else + { + nodes.push(node); + } + } + // merge delta forward + while (!nodes.empty()) + { + auto node = nodes.top(); + nodes.pop(); + tmp->merge(*node); + } + + return tmp; +} + +//========================================================================================== +// DeltaVersionEditAcceptor +//========================================================================================== + +DeltaVersionEditAcceptor::DeltaVersionEditAcceptor(const PageEntriesView * view_, bool 
ignore_invalid_ref_, Logger * log_) + : view(const_cast(view_)), + current_version(view->getSharedTailVersion()), + ignore_invalid_ref(ignore_invalid_ref_), + log(log_) +{ +#ifndef NDEBUG + // tail of view must be a delta + assert(!current_version->isBase()); + if (ignore_invalid_ref) + { + assert(log != nullptr); + } +#endif +} + +DeltaVersionEditAcceptor::~DeltaVersionEditAcceptor() = default; + +/// Apply edits and generate new delta +void DeltaVersionEditAcceptor::apply(PageEntriesEdit & edit) +{ + for (auto && rec : edit.getRecords()) + { + switch (rec.type) + { + case WriteBatch::WriteType::PUT: + this->applyPut(rec); + break; + case WriteBatch::WriteType::DEL: + this->applyDel(rec); + break; + case WriteBatch::WriteType::REF: + this->applyRef(rec); + break; + } + } +} + +void DeltaVersionEditAcceptor::applyPut(PageEntriesEdit::EditRecord & rec) +{ + assert(rec.type == WriteBatch::WriteType::PUT); + current_version->ref_deletions.erase(rec.page_id); + + auto [is_ref_exist, normal_page_id] = view->isRefId(rec.page_id); + if (!is_ref_exist) + { + // if ref not exist, add new ref-pair + normal_page_id = rec.page_id; + current_version->page_ref.emplace(rec.page_id, normal_page_id); + } + + // update normal page's entry + auto old_entry = view->findNormalPageEntry(normal_page_id); + if (is_ref_exist && old_entry == nullptr) + { + throw DB::Exception("Accessing RefPage" + DB::toString(rec.page_id) + " to non-exist Page" + DB::toString(normal_page_id), + ErrorCodes::LOGICAL_ERROR); + } + if (old_entry == nullptr) + { + // Page{normal_page_id} not exist + rec.entry.ref = 1; + current_version->normal_pages[normal_page_id] = rec.entry; + } + else + { + // replace ori Page{normal_page_id}'s entry but inherit ref-counting + rec.entry.ref = old_entry->ref + !is_ref_exist; + current_version->normal_pages[normal_page_id] = rec.entry; + } + + current_version->max_page_id = std::max(current_version->max_page_id, rec.page_id); +} + +void 
DeltaVersionEditAcceptor::applyDel(PageEntriesEdit::EditRecord & rec) +{ + assert(rec.type == WriteBatch::WriteType::DEL); + const PageId normal_page_id = view->resolveRefId(rec.page_id); + current_version->ref_deletions.insert(rec.page_id); + current_version->page_ref.erase(rec.page_id); + this->decreasePageRef(normal_page_id); +} + +void DeltaVersionEditAcceptor::applyRef(PageEntriesEdit::EditRecord & rec) +{ + assert(rec.type == WriteBatch::WriteType::REF); + current_version->ref_deletions.erase(rec.page_id); + // if `page_id` is a ref-id, collapse the ref-path to actual PageId + // eg. exist RefPage2 -> Page1, add RefPage3 -> RefPage2, collapse to RefPage3 -> Page1 + const PageId normal_page_id = view->resolveRefId(rec.ori_page_id); + auto old_entry = view->findNormalPageEntry(normal_page_id); + if (likely(old_entry != nullptr)) + { + // if RefPage{ref_id} already exist, release that ref first + auto [is_ref_id, old_normal_id] = view->isRefId(rec.page_id); + if (unlikely(is_ref_id)) + { + // if RefPage{ref-id} -> Page{normal_page_id} already exists, just ignore + if (old_normal_id == normal_page_id) + return; + this->decreasePageRef(old_normal_id); + } + current_version->page_ref[rec.page_id] = normal_page_id; + // increase entry's ref-count + auto new_entry = *old_entry; + new_entry.ref += 1; + current_version->normal_pages[rec.page_id] = new_entry; + } + else + { + // The Page to be ref is not exist. + if (ignore_invalid_ref) + { + LOG_WARNING(log, + "Ignore invalid RefPage while opening PageStorage: RefPage" + DB::toString(rec.page_id) + " to non-exist Page" + + DB::toString(rec.ori_page_id)); + } + else + { + // accept dangling ref if we are writing to a tmp entry map. 
+ // like entry map of WriteBatch or Gc or AnalyzeMeta + current_version->page_ref[rec.page_id] = normal_page_id; + } + } + current_version->max_page_id = std::max(current_version->max_page_id, rec.page_id); +} + +void DeltaVersionEditAcceptor::applyInplace(const PageEntriesVersionSetWithDelta::VersionPtr & current, const PageEntriesEdit & edit) +{ + assert(current->isBase()); + assert(current.use_count() == 1); + for (auto && rec : edit.getRecords()) + { + switch (rec.type) + { + case WriteBatch::WriteType::PUT: + current->put(rec.page_id, rec.entry); + break; + case WriteBatch::WriteType::DEL: + current->del(rec.page_id); + break; + case WriteBatch::WriteType::REF: + // Shorten ref-path in case there is RefPage to RefPage + current->ref(rec.page_id, rec.ori_page_id); + break; + } + } +} + +void DeltaVersionEditAcceptor::decreasePageRef(const PageId page_id) +{ + auto old_entry = view->findNormalPageEntry(page_id); + if (old_entry != nullptr) + { + auto entry = *old_entry; + entry.ref = old_entry->ref <= 1 ? 
0 : old_entry->ref - 1; + // Keep an tombstone entry (ref-count == 0), so that we can delete this entry when merged to base + current_version->normal_pages[page_id] = entry; + } +} + +} // namespace DB \ No newline at end of file diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSetWithDelta.h b/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSetWithDelta.h new file mode 100644 index 00000000000..29769fe60f9 --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesVersionSetWithDelta.h @@ -0,0 +1,89 @@ +#include + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class DeltaVersionEditAcceptor; + +class PageEntriesVersionSetWithDelta : public ::DB::MVCC::VersionSetWithDelta< // + PageEntriesForDelta, + PageEntriesView, + PageEntriesEdit, + DeltaVersionEditAcceptor> +{ +public: + using BaseType = ::DB::MVCC::VersionSetWithDelta; + using EditAcceptor = BaseType::EditAcceptor; + using VersionType = BaseType::VersionType; + using VersionPtr = BaseType::VersionPtr; + +public: + explicit PageEntriesVersionSetWithDelta(const ::DB::MVCC::VersionSetConfig & config_ = ::DB::MVCC::VersionSetConfig()) + : BaseType(config_) + { + } + +public: + std::set gcApply(PageEntriesEdit & edit); + + /// List all PageFile that are used by any version + std::set listAllLiveFiles() const; + + VersionPtr compactDeltas(const VersionPtr & tail) const override; + + VersionPtr compactDeltaAndBase(const VersionPtr & old_base, VersionPtr & delta) const override; + +private: + void collectLiveFilesFromVersionList(VersionPtr tail, std::set & visited, std::set & liveFiles) const; +}; + +/// Read old entries state from `view_` and apply new edit to `view_->tail` +class DeltaVersionEditAcceptor +{ +public: + explicit DeltaVersionEditAcceptor(const PageEntriesView * view_, // + bool ignore_invalid_ref_ = false, + Poco::Logger * log_ = nullptr); + + ~DeltaVersionEditAcceptor(); + + void apply(PageEntriesEdit & edit); + + static 
void applyInplace(const PageEntriesVersionSetWithDelta::VersionPtr & current, const PageEntriesEdit & edit); + + void gcApply(PageEntriesEdit & edit) { PageEntriesBuilder::gcApplyTemplate(view, edit, current_version); } + + static void gcApplyInplace( // + const PageEntriesVersionSetWithDelta::VersionPtr & current, + PageEntriesEdit & edit) + { + assert(current->isBase()); + assert(current.use_count() == 1); + PageEntriesBuilder::gcApplyTemplate(current, edit, current); + } + +private: + // Read old state from `view` and apply new edit to `current_version` + + void applyPut(PageEntriesEdit::EditRecord & record); + void applyDel(PageEntriesEdit::EditRecord & record); + void applyRef(PageEntriesEdit::EditRecord & record); + void decreasePageRef(PageId page_id); + +private: + PageEntriesView * view; + PageEntriesVersionSetWithDelta::VersionPtr current_version; + bool ignore_invalid_ref; + Poco::Logger * log; +}; + +} // namespace DB diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesView.cpp b/dbms/src/Storages/Page/VersionSet/PageEntriesView.cpp new file mode 100644 index 00000000000..4558c04fded --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesView.cpp @@ -0,0 +1,154 @@ +#include + +namespace DB +{ + +//// PageEntryMapView + +const PageEntry * PageEntriesView::find(PageId page_id) const +{ + // First we find ref-pairs to get the normal page id + bool found = false; + PageId normal_page_id = 0; + for (auto node = tail; node != nullptr; node = node->prev) + { + if (node->isRefDeleted(page_id)) + { + return nullptr; + } + + auto iter = node->page_ref.find(page_id); + if (iter != node->page_ref.end()) + { + found = true; + normal_page_id = iter->second; + break; + } + } + if (!found) + { + // The page have been deleted. + return nullptr; + } + + auto entry = findNormalPageEntry(normal_page_id); + // RefPage exists, but normal Page do NOT exist. 
Should NOT call here + if (entry == nullptr) + { + throw DB::Exception("Accessing RefPage" + DB::toString(page_id) + " to non-exist Page" + DB::toString(normal_page_id), + ErrorCodes::LOGICAL_ERROR); + } + return entry; +} + +const PageEntry & PageEntriesView::at(const PageId page_id) const +{ + auto entry = this->find(page_id); + if (entry == nullptr) + { + throw DB::Exception("Accessing non-exist Page[" + DB::toString(page_id) + "]", ErrorCodes::LOGICAL_ERROR); + } + return *entry; +} + +const PageEntry * PageEntriesView::findNormalPageEntry(PageId page_id) const +{ + for (auto node = tail; node != nullptr; node = node->prev) + { + auto iter = node->normal_pages.find(page_id); + if (iter != node->normal_pages.end()) + { + return &iter->second; + } + } + return nullptr; +} + +std::pair PageEntriesView::isRefId(PageId page_id) const +{ + auto node = tail; + for (; !node->isBase(); node = node->prev) + { + if (node->ref_deletions.count(page_id) > 0) + return {false, 0}; + auto iter = node->page_ref.find(page_id); + if (iter != node->page_ref.end()) + return {true, iter->second}; + } + return node->isRefId(page_id); +} + +PageId PageEntriesView::resolveRefId(PageId page_id) const +{ + auto [is_ref, normal_page_id] = isRefId(page_id); + return is_ref ? 
normal_page_id : page_id; +} + +std::set PageEntriesView::validPageIds() const +{ + std::stack> link_nodes; + for (auto node = tail; node != nullptr; node = node->prev) + { + link_nodes.emplace(node); + } + // Get valid pages, from link-list's head to tail + std::set valid_pages; + while (!link_nodes.empty()) + { + auto node = link_nodes.top(); + link_nodes.pop(); + if (!node->isBase()) + { + for (auto deleted_id : node->ref_deletions) + { + valid_pages.erase(deleted_id); + } + } + for (auto ref_pairs : node->page_ref) + { + valid_pages.insert(ref_pairs.first); + } + } + return valid_pages; +} + +std::set PageEntriesView::validNormalPageIds() const +{ + std::stack> link_nodes; + for (auto node = tail; node != nullptr; node = node->prev) + { + link_nodes.emplace(node); + } + // Get valid normal pages, from link-list's head to tail + std::set valid_normal_pages; + while (!link_nodes.empty()) + { + auto node = link_nodes.top(); + link_nodes.pop(); + if (!node->isBase()) + { + for (auto deleted_id : node->ref_deletions) + { + valid_normal_pages.erase(deleted_id); + } + } + for (auto & [page_id, entry] : node->normal_pages) + { + if (entry.ref != 0) + valid_normal_pages.insert(page_id); + } + } + return valid_normal_pages; +} + +PageId PageEntriesView::maxId() const +{ + PageId max_id = 0; + for (auto node = tail; node != nullptr; node = node->prev) + { + max_id = std::max(max_id, node->maxId()); + } + return max_id; +} + +} // namespace DB diff --git a/dbms/src/Storages/Page/VersionSet/PageEntriesView.h b/dbms/src/Storages/Page/VersionSet/PageEntriesView.h new file mode 100644 index 00000000000..cef486fcaf7 --- /dev/null +++ b/dbms/src/Storages/Page/VersionSet/PageEntriesView.h @@ -0,0 +1,47 @@ +#pragma once + +namespace DB +{ + +/// Treat a list of PageEntriesForDelta as a single PageEntries +class PageEntriesView +{ +private: + // tail of the list + std::shared_ptr tail; + +public: + explicit PageEntriesView(std::shared_ptr tail_) : tail(std::move(tail_)) {} + + 
const PageEntry * find(PageId page_id) const; + + const PageEntry & at(PageId page_id) const; + + std::pair isRefId(PageId page_id) const; + + // For iterate over all pages + std::set validPageIds() const; + + // For iterate over all normal pages + std::set validNormalPageIds() const; + + PageId maxId() const; + + inline std::shared_ptr getSharedTailVersion() const { return tail; } + + inline std::shared_ptr transferTailVersionOwn() + { + std::shared_ptr owned_ptr; + owned_ptr.swap(tail); + return owned_ptr; + } + +private: + const PageEntry * findNormalPageEntry(PageId page_id) const; + + PageId resolveRefId(PageId page_id) const; + + friend class DeltaVersionEditAcceptor; +}; + +} // namespace DB diff --git a/dbms/src/Storages/Page/WriteBatch.h b/dbms/src/Storages/Page/WriteBatch.h index b8cfc438e3d..fd127432e5b 100644 --- a/dbms/src/Storages/Page/WriteBatch.h +++ b/dbms/src/Storages/Page/WriteBatch.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -10,35 +10,55 @@ namespace DB class WriteBatch { +public: + enum class WriteType : UInt8 + { + DEL = 0, + PUT = 1, + REF = 2, + }; + private: struct Write { - bool is_put; - PageId page_id; - UInt64 tag; + WriteType type; + PageId page_id; + UInt64 tag; + // Page's data and size ReadBufferPtr read_buffer; UInt32 size; + // RefPage's origin page + PageId ori_page_id; }; using Writes = std::vector; public: void putPage(PageId page_id, UInt64 tag, const ReadBufferPtr & read_buffer, UInt32 size) { - Write w = {true, page_id, tag, read_buffer, size}; - writes.push_back(w); + Write w = {WriteType::PUT, page_id, tag, read_buffer, size, 0}; + writes.emplace_back(w); } + + // Add RefPage{ref_id} -> Page{page_id} + void putRefPage(PageId ref_id, PageId page_id) + { + Write w = {WriteType::REF, ref_id, 0, {}, 0, page_id}; + writes.emplace_back(w); + } + void delPage(PageId page_id) { - Write w = {false, page_id, 0, {}, 0}; - writes.push_back(w); + Write w = {WriteType::DEL, page_id, 0, {}, 0, 0}; + 
/// Base type for VersionType of VersionSet
///
/// An intrusive, reference-counted node of a doubly-linked list. A freshly
/// constructed node is linked to itself. The destructor unlinks the node from
/// its neighbours, and the node deletes itself when its ref-count drops to 0.
///
/// \tparam T -- the concrete version type deriving from this struct
///
/// NOTE(review): the counter's value type is not visible in this excerpt;
/// `uint32_t` is assumed because `VersionSet::toDebugStringUnlocked` casts the
/// counter to `uint32_t` -- confirm.
template <typename T>
struct MultiVersionCountable
{
public:
    std::atomic<uint32_t> ref_count;
    T * next;
    T * prev;

public:
    /// `self` must be the address of the object being constructed.
    explicit MultiVersionCountable(T * self) : ref_count(0), next(self), prev(self) {}
    virtual ~MultiVersionCountable()
    {
        assert(ref_count == 0);

        // Remove from linked list
        prev->next = next;
        next->prev = prev;
    }

    void incrRefCount() { ++ref_count; }

    /// Drop one reference; when it was the last one, delete this node while
    /// holding `mutex` exclusively.
    void decrRefCount(std::shared_mutex & mutex)
    {
        assert(ref_count >= 1);
        if (--ref_count == 0)
        {
            // in case two neighbor nodes remove from linked list
            std::unique_lock<std::shared_mutex> lock(mutex);
            delete this;
        }
    }

    // Not thread-safe, caller ensure.
    void decrRefCount()
    {
        assert(ref_count >= 1);
        if (--ref_count == 0)
        {
            delete this; // remove this node from version set
        }
    }
};
If version's ref count down to 0, it acquire unique_lock for mutex and then remove itself from version set +/// +/// \tparam VersionEdit_t -- Changes between two version +/// \tparam Builder_t -- Apply one or more VersionEdit_t to base version and build a new version +/// functions required: +/// Builder_t(Version_t *base) +/// -- Create a builder base on version `base` +/// void Builder_t::apply(const VersionEdit_t &) +/// -- Apply edit to builder +/// Version_t* Builder_t::build() +/// -- Build new version +template +class VersionSet +{ +public: + using BuilderType = TBuilder; + using VersionType = TVersion; + using VersionPtr = VersionType *; + +public: + explicit VersionSet(const VersionSetConfig & config_ = VersionSetConfig()) : placeholder_node(), current(nullptr) + { + (void)config_; // just ignore config + // append a init version to link + appendVersion(new VersionType); + } + + virtual ~VersionSet() + { + current->decrRefCount(); + assert(placeholder_node.next == &placeholder_node); // List must be empty + } + + void restore(VersionPtr const v) + { + std::unique_lock read_lock(read_mutex); + appendVersion(v); + } + + /// `apply` accept changes and append new version to version-list + void apply(const TVersionEdit & edit) + { + std::unique_lock read_lock(read_mutex); + + // apply edit on base + VersionPtr v = nullptr; + { + BuilderType builder(current); + builder.apply(edit); + v = builder.build(); + } + + appendVersion(v); + } + + size_t size() const + { + std::unique_lock read_lock(read_mutex); + size_t sz = 0; + for (VersionPtr v = current; v != &placeholder_node; v = v->prev) + sz += 1; + return sz; + } + + std::string toDebugStringUnlocked() const + { + std::string s; + for (VersionPtr v = placeholder_node.next; v != &placeholder_node; v = v->next) + { + if (!s.empty()) + s += "->"; + s += "{\"rc\":"; + s += DB::toString(uint32_t(v->ref_count)); + s += '}'; + } + return s; + } + +public: + /// A snapshot class for holding particular version + class 
Snapshot + { + private: + VersionPtr v; // particular version + std::shared_mutex * mutex; // mutex to be used when freeing version + + public: + Snapshot(VersionPtr version_, std::shared_mutex * mutex_) : v(version_), mutex(mutex_) { v->incrRefCount(); } + ~Snapshot() { v->decrRefCount(*mutex); } + + VersionPtr version() const { return v; } + + public: + // No copying allowed. + Snapshot(const Snapshot &) = delete; + Snapshot & operator=(const Snapshot &) = delete; + }; + using SnapshotPtr = std::shared_ptr; + + /// Create a snapshot for current version + SnapshotPtr getSnapshot() + { + std::shared_lock lock(read_mutex); + return std::make_shared(current, &read_mutex); + } + +protected: + VersionType placeholder_node; // Head of circular double-linked list of all versions + VersionPtr current; // current version; current == placeholder_node.prev + + mutable std::shared_mutex read_mutex; + +protected: + void appendVersion(VersionPtr const v) + { + // Make "v" become "current" + assert(v->ref_count == 0); + assert(v != current); + if (current != nullptr) + { + current->decrRefCount(); + } + current = v; + current->incrRefCount(); + + // Append to linked list + current->prev = placeholder_node.prev; + current->next = &placeholder_node; + current->prev->next = current; + current->next->prev = current; + } + +public: + // No copying allowed + VersionSet(const VersionSet &) = delete; + VersionSet & operator=(const VersionSet &) = delete; +}; + + +} // namespace MVCC +} // namespace DB diff --git a/dbms/src/Storages/Page/mvcc/VersionSetWithDelta.h b/dbms/src/Storages/Page/mvcc/VersionSetWithDelta.h new file mode 100644 index 00000000000..4861871ba76 --- /dev/null +++ b/dbms/src/Storages/Page/mvcc/VersionSetWithDelta.h @@ -0,0 +1,332 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace ProfileEvents +{ +extern const Event PSMVCCCompactOnDelta; +extern const Event 
PSMVCCCompactOnDeltaRebaseRejected; +extern const Event PSMVCCCompactOnBase; +extern const Event PSMVCCApplyOnCurrentBase; +extern const Event PSMVCCApplyOnCurrentDelta; +extern const Event PSMVCCApplyOnNewDelta; +} // namespace ProfileEvents + +namespace DB +{ +namespace MVCC +{ +/// Base type for VersionType of VersionSetWithDelta +template +struct MultiVersionCountableForDelta +{ +public: + std::shared_ptr prev; + +public: + explicit MultiVersionCountableForDelta() : prev(nullptr) {} + + virtual ~MultiVersionCountableForDelta() = default; +}; + +/// \tparam TVersion -- Single version on version-list. Require for a `prev` member, see `MultiVersionDeltaCountable` +/// \tparam TVersionView -- A view to see a list of versions as a single version +/// \tparam TVersionEdit -- Changes to apply to version set for generating new version +/// \tparam TEditAcceptor -- Accept a read view and apply edits to new version +template < // + typename TVersion, + typename TVersionView, + typename TVersionEdit, + typename TEditAcceptor> +class VersionSetWithDelta +{ +public: + using EditAcceptor = TEditAcceptor; + using VersionType = TVersion; + using VersionPtr = std::shared_ptr; + +public: + explicit VersionSetWithDelta(const ::DB::MVCC::VersionSetConfig & config_ = ::DB::MVCC::VersionSetConfig()) + : current(std::move(VersionType::createBase())), // + snapshots(std::move(std::make_shared(this, nullptr))), // + config(config_) + { + } + + virtual ~VersionSetWithDelta() + { + current.reset(); + // snapshot list is empty + assert(snapshots->prev == snapshots.get()); + } + + void apply(TVersionEdit & edit) + { + std::unique_lock read_lock(read_mutex); + + if (current.use_count() == 1 && current->isBase()) + { + ProfileEvents::increment(ProfileEvents::PSMVCCApplyOnCurrentBase); + // If no readers, we could directly merge edits. 
+ TEditAcceptor::applyInplace(current, edit); + } + else + { + if (current.use_count() != 1) + { + ProfileEvents::increment(ProfileEvents::PSMVCCApplyOnNewDelta); + // There are reader(s) on current, generate new delta version and append to version-list + VersionPtr v = VersionType::createDelta(); + appendVersion(std::move(v)); + } + else + { + ProfileEvents::increment(ProfileEvents::PSMVCCApplyOnCurrentDelta); + } + // Make a view from head to new version, then apply edits on `current`. + auto view = std::make_shared(current); + EditAcceptor builder(view.get()); + builder.apply(edit); + } + } + +public: + /// Snapshot. + /// When snapshot object is free, it will call `view.release()` to compact VersionList, + /// and remove itself from VersionSet's snapshots list. + class Snapshot + { + public: + VersionSetWithDelta * vset; + TVersionView view; + + Snapshot * prev; + Snapshot * next; + + public: + Snapshot(VersionSetWithDelta * vset_, VersionPtr tail_) : vset(vset_), view(std::move(tail_)), prev(this), next(this) {} + + ~Snapshot() + { + vset->compactOnDeltaRelease(view.transferTailVersionOwn()); + // Remove snapshot from linked list + std::unique_lock lock = vset->acquireForLock(); + prev->next = next; + next->prev = prev; + } + + const TVersionView * version() const { return &view; } + + template + friend class VersionSetWithDelta; + }; + + using SnapshotPtr = std::shared_ptr; + + /// Create a snapshot for current version. + /// call `snapshot.reset()` or let `snapshot` gone if you don't need it anymore. 
+ SnapshotPtr getSnapshot() + { + // acquire for unique_lock since we need to add all snapshots to link list + std::unique_lock lock(read_mutex); + auto s = std::make_shared(this, current); + // Register snapshot to VersionSet + s->prev = snapshots->prev; + s->next = snapshots.get(); + snapshots->prev->next = s.get(); + snapshots->prev = s.get(); + return s; + } + +protected: + void appendVersion(VersionPtr && v) + { + assert(v != current); + // Append to linked list + v->prev = current; + current = v; + } + +protected: + enum class RebaseResult + { + SUCCESS, + INVALID_VERSION, + }; + + /// Use after do compact on VersionList, rebase all + /// successor Version of Version{`old_base`} onto Version{`new_base`}. + /// Specially, if no successor version of Version{`old_base`}, which + /// means `current`==`old_base`, replace `current` with `new_base`. + /// Examples: + /// ┌────────────────────────────────┬───────────────────────────────────┐ + /// │ Before rebase │ After rebase │ + /// ├────────────────────────────────┼───────────────────────────────────┤ + /// │ Va <- Vb <- Vc │ Vd <- Vc │ + /// │ (old_base) (current) │ (new_base) (current) │ + /// ├────────────────────────────────┼───────────────────────────────────┤ + /// │ Va <- Vb <- Vc │ Vd │ + /// │ (current,old_base) │ (current, new_base) │ + /// └────────────────────────────────┴───────────────────────────────────┘ + /// Caller should ensure old_base is in VersionSet's link + RebaseResult rebase(const VersionPtr & old_base, const VersionPtr & new_base) + { + assert(old_base != nullptr); + std::unique_lock lock(read_mutex); + // Should check `old_base` is valid + if (!isValidVersion(old_base)) + { + return RebaseResult::INVALID_VERSION; + } + if (old_base == current) + { + current = new_base; + return RebaseResult::SUCCESS; + } + + auto q = current, p = current->prev; + while (p != nullptr && p != old_base) + { + q = p; + p = q->prev; + } + // p must point to `old_base` now + assert(p == old_base); + // 
rebase q on `new_base` + q->prev = new_base; + return RebaseResult::SUCCESS; + } + + std::unique_lock acquireForLock() { return std::unique_lock(read_mutex); } + + // Return true if `tail` is in current version-list + bool isValidVersion(const VersionPtr tail) const + { + for (auto node = current; node != nullptr; node = node->prev) + { + if (node == tail) + { + return true; + } + } + return false; + } + + // If `tail` is in current + // Do compaction on version-list [head, tail]. If there some versions after tail, use vset's `rebase` to concat them. + void compactOnDeltaRelease(VersionPtr && tail) + { + do + { + if (tail == nullptr || tail->isBase()) + { + break; + } + { + // If we can not found tail from `current` version-list, then other view has already + // do compaction on `tail` version, and we can just free that version + std::shared_lock lock(read_mutex); + if (!isValidVersion(tail)) + break; + } + // do compact on delta + ProfileEvents::increment(ProfileEvents::PSMVCCCompactOnDelta); + VersionPtr tmp = compactDeltas(tail); // Note: May be compacted by different threads + if (tmp != nullptr) + { + // rebase vset->current on `this->tail` to base on `tmp` + if (this->rebase(tail, tmp) == RebaseResult::INVALID_VERSION) + { + // Another thread may have done compaction and rebase, then we just release `tail` + ProfileEvents::increment(ProfileEvents::PSMVCCCompactOnDeltaRebaseRejected); + break; + } + // release tail ref on this view, replace with tmp + tail = tmp; + tmp.reset(); + } + // do compact on base + if (tail->shouldCompactToBase(config)) + { + ProfileEvents::increment(ProfileEvents::PSMVCCCompactOnBase); + auto old_base = tail->prev; + assert(old_base != nullptr); + VersionPtr new_base = compactDeltaAndBase(old_base, tail); + // replace nodes [head, tail] -> new_base + if (this->rebase(tail, new_base) == RebaseResult::INVALID_VERSION) + { + // Another thread may have done compaction and rebase, then we just release `tail`. 
In case we may add more code after do compaction on base + break; + } + } + } while (false); + tail.reset(); + } + + virtual VersionPtr compactDeltas(const VersionPtr & tail) const = 0; + + virtual VersionPtr compactDeltaAndBase(const VersionPtr & old_base, VersionPtr & delta) const = 0; + +public: + /// Some helper functions + + size_t size() const + { + std::unique_lock read_lock(read_mutex); + return sizeUnlocked(); + } + + size_t sizeUnlocked() const + { + size_t sz = 0; + for (auto v = current; v != nullptr; v = v->prev) + { + sz += 1; + } + return sz; + } + + std::string toDebugStringUnlocked() const { return versionToDebugString(current); } + + static std::string versionToDebugString(VersionPtr tail) + { + std::string s; + bool is_first = true; + std::stack deltas; + for (auto v = tail; v != nullptr; v = v->prev) + { + deltas.emplace(v); + } + while (!deltas.empty()) + { + auto v = deltas.top(); + deltas.pop(); + s += is_first ? "" : "<-"; + is_first = false; + s += "{\"rc\":"; + s += DB::toString(v.use_count() - 1); + s += ",\"addr\":", s += DB::ptrToString(v.get()); + s += '}'; + } + return s; + } + +protected: + mutable std::shared_mutex read_mutex; + VersionPtr current; + SnapshotPtr snapshots; + ::DB::MVCC::VersionSetConfig config; +}; + +} // namespace MVCC +} // namespace DB diff --git a/dbms/src/Storages/Page/tests/CMakeLists.txt b/dbms/src/Storages/Page/tests/CMakeLists.txt index ccc38c1cf4f..913fcbf33b9 100644 --- a/dbms/src/Storages/Page/tests/CMakeLists.txt +++ b/dbms/src/Storages/Page/tests/CMakeLists.txt @@ -1,9 +1,11 @@ add_headers_and_sources(page_storage ../) -add_library(page_storage +add_headers_and_sources(page_storage ../mvcc) +add_headers_and_sources(page_storage ../VersionSet) +add_library(page_storage EXCLUDE_FROM_ALL ${page_storage_headers} ${page_storage_sources}) target_link_libraries(page_storage clickhouse_common_io) -# glob all unit tests of dm into gtests_page_storage +# glob all unit tests of PageStorage into 
gtests_page_storage macro(grep_gtest_sources BASE_DIR DST_VAR) # Cold match files that are not in tests/ directories file(GLOB_RECURSE "${DST_VAR}" RELATIVE "${BASE_DIR}" "gtest*.cpp") @@ -30,3 +32,5 @@ add_executable(test_page_storage_write_disk_full test_page_storage_write_disk_fu target_link_libraries(test_page_storage_write_disk_full dbms) target_compile_options(test_page_storage_write_disk_full PRIVATE -Wno-format) +add_executable(mem_usage_test mem_usage_test.cpp) + diff --git a/dbms/src/Storages/Page/tests/gtest_page_entry_map.cpp b/dbms/src/Storages/Page/tests/gtest_page_entry_map.cpp new file mode 100644 index 00000000000..f3cc41a51e7 --- /dev/null +++ b/dbms/src/Storages/Page/tests/gtest_page_entry_map.cpp @@ -0,0 +1,359 @@ +#include "gtest/gtest.h" + +#include + +namespace DB +{ +namespace tests +{ + +class PageEntryMap_test : public ::testing::Test +{ +public: + PageEntryMap_test() : map(nullptr), versions() {} + +protected: + void SetUp() override + { + // Generate an empty PageEntries for each test + auto snapshot = versions.getSnapshot(); + PageEntriesBuilder builder(snapshot->version()); + map = builder.build(); + } + + void TearDown() override { delete map; } + + PageEntries * map; + +private: + PageEntriesVersionSet versions; +}; + +TEST_F(PageEntryMap_test, Empty) +{ + size_t item_count = 0; + for (auto iter = map->cbegin(); iter != map->cend(); ++iter) + { + item_count += 1; + } + ASSERT_EQ(item_count, 0UL); + ASSERT_EQ(map->maxId(), 0UL); + + + // add some Pages, RefPages + PageEntry p0entry; + p0entry.file_id = 1; + p0entry.level = 0; + p0entry.checksum = 0x123; + map->put(0, p0entry); + map->ref(1, 0); + item_count = 0; + for (auto iter = map->cbegin(); iter != map->cend(); ++iter) + { + item_count += 1; + } + ASSERT_EQ(item_count, 2UL); + ASSERT_EQ(map->maxId(), 1UL); + + map->clear(); + item_count = 0; + for (auto iter = map->cbegin(); iter != map->cend(); ++iter) + { + item_count += 1; + } + ASSERT_EQ(item_count, 0UL); + 
ASSERT_EQ(map->maxId(), 0UL); +} + +TEST_F(PageEntryMap_test, UpdatePageEntry) +{ + const PageId page_id = 0; + PageEntry entry0; + entry0.checksum = 0x123; + map->put(page_id, entry0); + ASSERT_EQ(map->at(page_id).checksum, entry0.checksum); + + PageEntry entry1; + entry1.checksum = 0x456; + map->put(page_id, entry1); + ASSERT_EQ(map->at(page_id).checksum, entry1.checksum); + + map->del(page_id); + ASSERT_EQ(map->find(page_id), nullptr); +} + +TEST_F(PageEntryMap_test, PutDel) +{ + PageEntry p0entry; + p0entry.file_id = 1; + p0entry.level = 0; + p0entry.checksum = 0x123; + map->put(0, p0entry); + { + ASSERT_NE(map->find(0), nullptr); + const PageEntry & entry = map->at(0); + EXPECT_EQ(entry.file_id, p0entry.file_id); + EXPECT_EQ(entry.level, p0entry.level); + EXPECT_EQ(entry.checksum, p0entry.checksum); + } + // add RefPage2 -> Page0 + map->ref(2, 0); + { + ASSERT_NE(map->find(2), nullptr); + const PageEntry & entry = map->at(2); + EXPECT_EQ(entry.file_id, p0entry.file_id); + EXPECT_EQ(entry.level, p0entry.level); + EXPECT_EQ(entry.checksum, p0entry.checksum); + } + + // remove RefPage0 + map->del(0); + // now RefPage0 removed + ASSERT_EQ(map->find(0), nullptr); + { + // RefPage2 exist + ASSERT_NE(map->find(2), nullptr); + const PageEntry & entry = map->at(2); + EXPECT_EQ(entry.file_id, p0entry.file_id); + EXPECT_EQ(entry.level, p0entry.level); + EXPECT_EQ(entry.checksum, p0entry.checksum); + } + + // remove RefPage2 + map->del(2); + ASSERT_EQ(map->find(0), nullptr); + ASSERT_EQ(map->find(2), nullptr); +} + +TEST_F(PageEntryMap_test, UpdateRefPageEntry) +{ + const PageId page_id = 0; + const PageId ref_id = 1; // RefPage1 -> Page0 + PageEntry entry0; + entry0.checksum = 0x123; + map->put(page_id, entry0); + ASSERT_NE(map->find(page_id), nullptr); + ASSERT_EQ(map->at(page_id).checksum, entry0.checksum); + + map->ref(ref_id, page_id); + ASSERT_NE(map->find(ref_id), nullptr); + ASSERT_EQ(map->at(ref_id).checksum, entry0.checksum); + + // update on Page0, both Page0 
and RefPage1 entry get update + PageEntry entry1; + entry1.checksum = 0x456; + map->put(page_id, entry1); + ASSERT_EQ(map->at(page_id).checksum, entry1.checksum); + ASSERT_EQ(map->at(ref_id).checksum, entry1.checksum); + + // update on RefPage1, both Page0 and RefPage1 entry get update + PageEntry entry2; + entry2.checksum = 0x789; + map->put(page_id, entry2); + ASSERT_EQ(map->at(page_id).checksum, entry2.checksum); + ASSERT_EQ(map->at(ref_id).checksum, entry2.checksum); + + // delete pages + map->del(page_id); + ASSERT_EQ(map->find(page_id), nullptr); + ASSERT_NE(map->find(ref_id), nullptr); + + map->del(ref_id); + ASSERT_EQ(map->find(ref_id), nullptr); +} + +TEST_F(PageEntryMap_test, UpdateRefPageEntry2) +{ + PageEntry entry0; + entry0.checksum = 0xf; + map->put(0, entry0); + map->ref(1, 0); + map->del(0); + ASSERT_EQ(map->find(0), nullptr); + ASSERT_EQ(map->at(1).checksum, 0xfUL); + + // update Page0, both Page0 and RefPage1 got update + PageEntry entry1; + entry1.checksum = 0x1; + map->put(0, entry1); + ASSERT_EQ(map->at(0).checksum, 0x1UL); + ASSERT_EQ(map->at(1).checksum, 0x1UL); +} + +TEST_F(PageEntryMap_test, AddRefToNonExistPage) +{ + PageEntry p0entry; + p0entry.file_id = 1; + p0entry.level = 0, p0entry.checksum = 0x123; + map->put(0, p0entry); + // if try to add ref to non-exist page + ASSERT_THROW({ map->ref(3, 2); }, DB::Exception); + // if try to access to non exist page, we get an exception + ASSERT_THROW({ map->at(3); }, DB::Exception); + + // accept add RefPage{3} to non-exist Page{2} + ASSERT_NO_THROW(map->ref(3, 2)); + // FIXME we can find iterator by RefPage's id + //auto iter_to_non_exist_ref_page = map->find(3); + //ASSERT_NE(iter_to_non_exist_ref_page, nullptr); + // FIXME but if we want to access that non-exist Page, we get an exception + //ASSERT_THROW({ iter_to_non_exist_ref_page.pageEntry(); }, DB::Exception); + // if try to access to non exist page, we get an exception + ASSERT_THROW({ map->at(3); }, DB::Exception); +} + 
+/// Adding the same RefPage twice must not change what the ref resolves to.
+TEST_F(PageEntryMap_test, PutDuplicateRef)
+{
+    PageEntry p0entry;
+    p0entry.checksum = 0xFF;
+    map->put(0, p0entry);
+    ASSERT_EQ(map->at(0).checksum, p0entry.checksum);
+
+    // if RefPage1 -> Page0 is put twice, the second ref call is collapsed
+    map->ref(1, 0);
+    ASSERT_EQ(map->at(1).checksum, p0entry.checksum);
+    map->ref(1, 0);
+    ASSERT_EQ(map->at(1).checksum, p0entry.checksum);
+
+    // deleting Page0 leaves RefPage1 readable
+    map->del(0);
+    ASSERT_EQ(map->find(0), nullptr);
+    ASSERT_EQ(map->at(1).checksum, p0entry.checksum);
+}
+
+/// A ref created on top of another ref keeps resolving to the same entry while the other ids are deleted.
+TEST_F(PageEntryMap_test, PutRefOnRef)
+{
+    PageEntry p0entry;
+    p0entry.file_id = 1;
+    p0entry.level = 0;
+    p0entry.checksum = 0x123;
+    // put Page0
+    map->put(0, p0entry);
+    // add RefPage2 -> Page0
+    map->ref(2, 0);
+    // add RefPage3 -> RefPage2 -> Page0
+    map->ref(3, 2);
+    {
+        ASSERT_NE(map->find(3), nullptr);
+        const PageEntry & entry = map->at(3);
+        EXPECT_EQ(entry.file_id, p0entry.file_id);
+        EXPECT_EQ(entry.level, p0entry.level);
+        EXPECT_EQ(entry.checksum, p0entry.checksum);
+    }
+
+    // remove RefPage2
+    map->del(2);
+    // now RefPage2 is removed
+    ASSERT_EQ(map->find(2), nullptr);
+    {
+        // Page0 still exists
+        ASSERT_NE(map->find(0), nullptr);
+        const PageEntry & entry = map->at(0);
+        EXPECT_EQ(entry.file_id, p0entry.file_id);
+        EXPECT_EQ(entry.level, p0entry.level);
+        EXPECT_EQ(entry.checksum, p0entry.checksum);
+    }
+    {
+        // RefPage3 still exists
+        ASSERT_NE(map->find(3), nullptr);
+        const PageEntry & entry = map->at(3);
+        EXPECT_EQ(entry.file_id, p0entry.file_id);
+        EXPECT_EQ(entry.level, p0entry.level);
+        EXPECT_EQ(entry.checksum, p0entry.checksum);
+    }
+
+    // remove Page0
+    map->del(0);
+    // now Page0 is removed
+    ASSERT_EQ(map->find(0), nullptr);
+    ASSERT_EQ(map->find(2), nullptr);
+    {
+        // RefPage3 still exists
+        ASSERT_NE(map->find(3), nullptr);
+        const PageEntry & entry = map->at(3);
+        EXPECT_EQ(entry.file_id, p0entry.file_id);
+        EXPECT_EQ(entry.level, p0entry.level);
+        EXPECT_EQ(entry.checksum, p0entry.checksum);
+    }
+
+    // remove RefPage3
+    map->del(3);
+    // now RefPage3 is removed
+    ASSERT_EQ(map->find(3), nullptr);
+    ASSERT_EQ(map->find(0), nullptr);
+    ASSERT_EQ(map->find(2), nullptr);
+}
+
+/// Re-pointing an existing page id at another page makes it resolve to that page's entry.
+TEST_F(PageEntryMap_test, ReBindRef)
+{
+    PageEntry entry0;
+    entry0.file_id = 1;
+    entry0.level = 0;
+    entry0.checksum = 0x123;
+    PageEntry entry1;
+    entry1.file_id = 1;
+    entry1.level = 0;
+    // must differ from entry0, otherwise the checks below cannot tell whether the rebind happened
+    entry1.checksum = 0x456;
+    // put Page0, Page1
+    map->put(0, entry0);
+    ASSERT_EQ(map->at(0).checksum, entry0.checksum);
+    map->put(1, entry1);
+    ASSERT_EQ(map->at(1).checksum, entry1.checksum);
+
+    // rebind RefPage0 -> Page1
+    map->ref(0, 1);
+    ASSERT_EQ(map->at(0).checksum, entry1.checksum);
+
+    // deleting Page1 leaves id 0 resolving to Page1's entry
+    map->del(1);
+    ASSERT_EQ(map->at(0).checksum, entry1.checksum);
+    map->del(0);
+}
+
+/// Iterating the map visits normal pages and ref pages alike, each with its resolved entry.
+TEST_F(PageEntryMap_test, Scan)
+{
+    PageEntry p0entry;
+    p0entry.file_id = 1;
+    p0entry.level = 0;
+    p0entry.checksum = 0x123;
+    PageEntry p1entry;
+    p1entry.file_id = 1;
+    p1entry.level = 0;
+    p1entry.checksum = 0x456;
+    map->put(0, p0entry);
+    map->put(1, p1entry);
+    map->ref(10, 0);
+    map->ref(11, 1);
+
+    // scan through all RefPages {0, 1, 10, 11}
+    std::set<PageId> page_ids;
+    for (auto iter = map->cbegin(); iter != map->cend(); ++iter)
+    {
+        page_ids.insert(iter.pageId());
+        // ids 0 and 10 resolve to Page0, ids 1 and 11 resolve to Page1
+        if (iter.pageId() % 10 == 0)
+        {
+            const PageEntry & entry = iter.pageEntry();
+            EXPECT_EQ(entry.file_id, p0entry.file_id);
+            EXPECT_EQ(entry.level, p0entry.level);
+            EXPECT_EQ(entry.checksum, p0entry.checksum);
+        }
+        else if (iter.pageId() % 10 == 1)
+        {
+            const PageEntry & entry = iter.pageEntry();
+            EXPECT_EQ(entry.file_id, p1entry.file_id);
+            EXPECT_EQ(entry.level, p1entry.level);
+            EXPECT_EQ(entry.checksum, p1entry.checksum);
+        }
+    }
+    ASSERT_EQ(page_ids.size(), 4UL);
+
+    // clear all mapping
+    map->clear();
+    page_ids.clear();
+    for (auto iter = map->cbegin(); iter != map->cend(); ++iter)
+    {
+        page_ids.insert(iter.pageId());
+    }
+    ASSERT_TRUE(page_ids.empty());
+}
+
+} // namespace tests
+} // namespace DB
diff --git a/dbms/src/Storages/Page/tests/gtest_page_file.cpp 
b/dbms/src/Storages/Page/tests/gtest_page_file.cpp index 47e79b84cad..b557bb90f84 100644 --- a/dbms/src/Storages/Page/tests/gtest_page_file.cpp +++ b/dbms/src/Storages/Page/tests/gtest_page_file.cpp @@ -1,4 +1,7 @@ #include "gtest/gtest.h" + +#include + #include namespace DB @@ -8,8 +11,8 @@ namespace tests TEST(PageFile_test, Compare) { - PageFile pf0 = PageFile::openPageFileForRead(0, 0, ".", &Logger::get("PageFile")); - PageFile pf1 = PageFile::openPageFileForRead(0, 1, ".", &Logger::get("PageFile")); + PageFile pf0 = PageFile::openPageFileForRead(0, 0, ".", &Poco::Logger::get("PageFile")); + PageFile pf1 = PageFile::openPageFileForRead(0, 1, ".", &Poco::Logger::get("PageFile")); PageFile::Comparator comp; ASSERT_EQ(comp(pf0, pf1), true); diff --git a/dbms/src/Storages/Page/tests/gtest_page_map_version_set.cpp b/dbms/src/Storages/Page/tests/gtest_page_map_version_set.cpp new file mode 100644 index 00000000000..a41bc17114a --- /dev/null +++ b/dbms/src/Storages/Page/tests/gtest_page_map_version_set.cpp @@ -0,0 +1,643 @@ +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace tests +{ + +template +class PageMapVersionSet_test : public ::testing::Test +{ +public: + static void SetUpTestCase() + { + Poco::AutoPtr channel = new Poco::ConsoleChannel(std::cerr); + Poco::AutoPtr formatter(new Poco::PatternFormatter); + formatter->setProperty("pattern", "%L%Y-%m-%d %H:%M:%S.%i <%p> %s: %t"); + Poco::AutoPtr formatting_channel(new Poco::FormattingChannel(formatter, channel)); + Logger::root().setChannel(formatting_channel); + Logger::root().setLevel("trace"); + } + +public: + void SetUp() override + { + config_.compact_hint_delta_entries = 1; + config_.compact_hint_delta_deletions = 1; + } + +protected: + ::DB::MVCC::VersionSetConfig config_; +}; + +TYPED_TEST_CASE_P(PageMapVersionSet_test); + +TYPED_TEST_P(PageMapVersionSet_test, ApplyEdit) +{ + TypeParam versions(this->config_); + 
LOG_TRACE(&Logger::root(), "init :" + versions.toDebugStringUnlocked()); + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0x123; + edit.put(0, e); + versions.apply(edit); + } + // VersionSet, new version generate && old version removed at the same time + // VersionSetWithDelta, delta version merged + LOG_TRACE(&Logger::root(), "apply A:" + versions.toDebugStringUnlocked()); + EXPECT_EQ(versions.size(), 1UL); + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0x456; + edit.put(1, e); + versions.apply(edit); + } + LOG_TRACE(&Logger::root(), "apply B:" + versions.toDebugStringUnlocked()); + auto s2 = versions.getSnapshot(); + EXPECT_EQ(versions.size(), 1UL); + auto entry = s2->version()->at(0); + ASSERT_EQ(entry.checksum, 0x123UL); + auto entry2 = s2->version()->at(1); + ASSERT_EQ(entry2.checksum, 0x456UL); + s2.reset(); // release snapshot + EXPECT_EQ(versions.size(), 1UL); +} + +/// Generate two different snapshot(s1, s2) with apply new edits. +/// s2 released first, then release s1 +TYPED_TEST_P(PageMapVersionSet_test, ApplyEditWithReadLock) +{ + TypeParam versions(this->config_); + auto s1 = versions.getSnapshot(); + EXPECT_EQ(versions.size(), 1UL); + LOG_TRACE(&Logger::root(), "snapshot 1:" + versions.toDebugStringUnlocked()); + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0x123; + edit.put(0, e); + versions.apply(edit); + } + EXPECT_EQ(versions.size(), 2UL); // former node is hold by s1, append new version + LOG_TRACE(&Logger::root(), "apply B:" + versions.toDebugStringUnlocked()); + + // Get snapshot for checking edit is success + auto s2 = versions.getSnapshot(); + LOG_TRACE(&Logger::root(), "snapshot 2:" + versions.toDebugStringUnlocked()); + auto entry = s2->version()->at(0); + ASSERT_EQ(entry.checksum, 0x123UL); + + // Release snapshot2 + s2.reset(); + LOG_TRACE(&Logger::root(), "rel snap 2:" + versions.toDebugStringUnlocked()); + /// For VersionSet, size is 2 since A is still hold by s1 + /// For VersionDeltaSet, size is 1 since 
we do a compaction on delta + if constexpr (std::is_same_v) + EXPECT_EQ(versions.size(), 2UL); + else + EXPECT_EQ(versions.size(), 1UL); + + s1.reset(); + LOG_TRACE(&Logger::root(), "rel snap 1:" + versions.toDebugStringUnlocked()); + // VersionSet, old version removed from version set + // VersionSetWithDelta, delta version merged + EXPECT_EQ(versions.size(), 1UL); + + // Ensure that after old snapshot released, new snapshot get the same content + auto s3 = versions.getSnapshot(); + entry = s3->version()->at(0); + ASSERT_EQ(entry.checksum, 0x123UL); + s3.reset(); + + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0x456; + edit.put(0, e); + versions.apply(edit); + } + LOG_TRACE(&Logger::root(), "apply C:" + versions.toDebugStringUnlocked()); + // VersionSet, new version gen and old version remove at the same time + // VersionSetWithDelta, C merge to delta + EXPECT_EQ(versions.size(), 1UL); + auto s4 = versions.getSnapshot(); + entry = s4->version()->at(0); + ASSERT_EQ(entry.checksum, 0x456UL); +} + +/// Generate two different snapshot(s1, s2) with apply new edits. 
+/// s1 released first, then release s2
+TYPED_TEST_P(PageMapVersionSet_test, ApplyEditWithReadLock2)
+{
+    TypeParam versions(this->config_);
+    auto s1 = versions.getSnapshot();
+    LOG_TRACE(&Logger::root(), "snapshot 1:" + versions.toDebugStringUnlocked());
+    PageEntriesEdit edit;
+    PageEntry e;
+    e.checksum = 0x123;
+    edit.put(0, e);
+    versions.apply(edit);
+    LOG_TRACE(&Logger::root(), "apply B:" + versions.toDebugStringUnlocked());
+    // s2 is taken after the edit, so it must see Page0
+    auto s2 = versions.getSnapshot();
+    auto entry = s2->version()->at(0);
+    ASSERT_EQ(entry.checksum, 0x123UL);
+
+    s1.reset();
+    LOG_TRACE(&Logger::root(), "rel snap 1:" + versions.toDebugStringUnlocked());
+    // VersionSet: size decreases to 1 when s1 is released
+    // VersionSetWithDelta: size stays 2 since we can not do a compaction on delta
+    if constexpr (std::is_same_v<TypeParam, PageEntriesVersionSet>)
+        EXPECT_EQ(versions.size(), 1UL);
+    else
+        EXPECT_EQ(versions.size(), 2UL);
+
+    // once the last snapshot is released, both implementations are back to a single version
+    s2.reset();
+    LOG_TRACE(&Logger::root(), "rel snap 2:" + versions.toDebugStringUnlocked());
+    EXPECT_EQ(versions.size(), 1UL);
+}
+
+/// Generate two different snapshot(s1, s2) with apply new edits.
+/// s1 released first, then release s2 +TYPED_TEST_P(PageMapVersionSet_test, ApplyEditWithReadLock3) +{ + TypeParam versions(this->config_); + auto s1 = versions.getSnapshot(); + LOG_TRACE(&Logger::root(), "snapshot 1:" + versions.toDebugStringUnlocked()); + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0x123; + edit.put(0, e); + versions.apply(edit); + } + LOG_TRACE(&Logger::root(), "apply B:" + versions.toDebugStringUnlocked()); + auto s2 = versions.getSnapshot(); + auto entry = s2->version()->at(0); + ASSERT_EQ(entry.checksum, 0x123UL); + + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0xff; + edit.put(1, e); + versions.apply(edit); + } + LOG_TRACE(&Logger::root(), "apply C:" + versions.toDebugStringUnlocked()); + auto s3 = versions.getSnapshot(); + entry = s3->version()->at(1); + ASSERT_EQ(entry.checksum, 0xFFUL); + + s1.reset(); + LOG_TRACE(&Logger::root(), "rel snap 1:" + versions.toDebugStringUnlocked()); + // VersionSet, size decrease to 2 when s1 release + // VersionSetWithDelta, size is 3 since we can not do a compaction on delta + if constexpr (std::is_same_v) + EXPECT_EQ(versions.size(), 2UL); + else + EXPECT_EQ(versions.size(), 3UL); + + s2.reset(); + LOG_TRACE(&Logger::root(), "rel snap 2:" + versions.toDebugStringUnlocked()); + if constexpr (std::is_same_v) + EXPECT_EQ(versions.size(), 1UL); + else + EXPECT_EQ(versions.size(), 2UL); + + s3.reset(); + LOG_TRACE(&Logger::root(), "rel snap 3:" + versions.toDebugStringUnlocked()); + EXPECT_EQ(versions.size(), 1UL); +} + +TYPED_TEST_P(PageMapVersionSet_test, Restore) +{ + TypeParam versions(this->config_); + if constexpr (std::is_same_v) + { + auto s1 = versions.getSnapshot(); + + typename TypeParam::BuilderType builder(s1->version(), true, &Poco::Logger::root()); + + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 1; + edit.put(1, e); + edit.del(1); + e.checksum = 2; + edit.put(2, e); + e.checksum = 3; + edit.put(3, e); + builder.apply(edit); + } + { + PageEntriesEdit edit; + 
edit.del(2); + builder.apply(edit); + } + versions.restore(builder.build()); + } + else + { + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 1; + edit.put(1, e); + edit.del(1); + e.checksum = 2; + edit.put(2, e); + e.checksum = 3; + edit.put(3, e); + versions.apply(edit); + } + { + PageEntriesEdit edit; + edit.del(2); + versions.apply(edit); + } + } + + auto s = versions.getSnapshot(); + auto entry = s->version()->find(1); + ASSERT_EQ(entry, nullptr); + auto entry2 = s->version()->find(2); + ASSERT_EQ(entry2, nullptr); + auto entry3 = s->version()->find(3); + ASSERT_NE(entry3, nullptr); + ASSERT_EQ(entry3->checksum, 3UL); + + std::set valid_normal_page_ids; + if constexpr (std::is_same_v) + { + for (auto iter = s->version()->pages_cbegin(); iter != s->version()->pages_cend(); iter++) + valid_normal_page_ids.insert(iter->first); + } + else + { + valid_normal_page_ids = s->version()->validNormalPageIds(); + } + ASSERT_EQ(valid_normal_page_ids.count(1), 0UL); + ASSERT_EQ(valid_normal_page_ids.count(2), 0UL); + ASSERT_EQ(valid_normal_page_ids.count(3), 1UL); +} + +TYPED_TEST_P(PageMapVersionSet_test, GcConcurrencyDelPage) +{ + PageId pid = 0; + TypeParam versions(this->config_); + // Page0 is in PageFile{2, 0} at first + { + PageEntriesEdit init_edit; + PageEntry e; + e.file_id = 2; + e.level = 1; + init_edit.put(pid, e); + versions.apply(init_edit); + } + + // gc try to move Page0 -> PageFile{5, 1}, but is interrupt by write thread before gcApply + PageEntriesEdit gc_edit; + PageEntry e; + e.file_id = 5; + e.level = 1; + gc_edit.put(pid, e); + + { + // write thread del Page0 before gc thread get unique_lock of `read_mutex` + PageEntriesEdit write_edit; + write_edit.del(0); + versions.apply(write_edit); + } + + // gc continue + versions.gcApply(gc_edit); + + // Page0 don't update to page_map + auto snapshot = versions.getSnapshot(); + auto entry = snapshot->version()->find(pid); + ASSERT_EQ(entry, nullptr); +} + +#pragma clang diagnostic push +#pragma clang 
diagnostic ignored "-Wunneeded-internal-declaration" +static void EXPECT_PagePos_LT(PageFileIdAndLevel p0, PageFileIdAndLevel p1) +{ + EXPECT_LT(p0, p1); +} +#pragma clang diagnostic pop + +TYPED_TEST_P(PageMapVersionSet_test, GcPageMove) +{ + EXPECT_PagePos_LT({4, 0}, {5, 1}); + EXPECT_PagePos_LT({5, 0}, {5, 1}); + EXPECT_PagePos_LT({5, 1}, {6, 1}); + EXPECT_PagePos_LT({5, 2}, {6, 1}); + + TypeParam versions(this->config_); + + const PageId pid = 0; + const PageId ref_pid = 1; + // old Page0 is in PageFile{5, 0} + { + PageEntriesEdit init_edit; + PageEntry e; + e.file_id = 5; + e.level = 0; + init_edit.put(pid, e); + init_edit.ref(ref_pid, pid); + versions.apply(init_edit); + } + + // gc move Page0 -> PageFile{5,1} + PageEntriesEdit gc_edit; + { + PageEntry e; + e.file_id = 5; + e.level = 1; + gc_edit.put(pid, e); + versions.gcApply(gc_edit); + } + + // Page get updated + auto snapshot = versions.getSnapshot(); + PageEntry entry = snapshot->version()->at(pid); + ASSERT_TRUE(entry.isValid()); + ASSERT_EQ(entry.file_id, 5ULL); + ASSERT_EQ(entry.level, 1U); + ASSERT_EQ(entry.ref, 2u); + + // RefPage got update at the same time + entry = snapshot->version()->at(ref_pid); + ASSERT_TRUE(entry.isValid()); + ASSERT_EQ(entry.file_id, 5u); + ASSERT_EQ(entry.level, 1u); + ASSERT_EQ(entry.ref, 2u); +} + +TYPED_TEST_P(PageMapVersionSet_test, GcConcurrencySetPage) +{ + const PageId pid = 0; + TypeParam versions(this->config_); + + + // gc move Page0 -> PageFile{5,1} + PageEntriesEdit gc_edit; + { + PageEntry e; + e.file_id = 5; + e.level = 1; + gc_edit.put(pid, e); + } + + { + // write thread insert newer Page0 before gc thread get unique_lock on `read_mutex` + PageEntriesEdit write_edit; + PageEntry e; + e.file_id = 6; + e.level = 0; + write_edit.put(pid, e); + versions.apply(write_edit); + } + + // gc continue + versions.gcApply(gc_edit); + + // read + auto snapshot = versions.getSnapshot(); + const PageEntry entry = snapshot->version()->at(pid); + 
ASSERT_TRUE(entry.isValid()); + ASSERT_EQ(entry.file_id, 6ULL); + ASSERT_EQ(entry.level, 0U); +} + +TYPED_TEST_P(PageMapVersionSet_test, UpdateOnRefPage) +{ + TypeParam versions(this->config_); + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0xf; + edit.put(2, e); + edit.ref(3, 2); + versions.apply(edit); + } + auto s1 = versions.getSnapshot(); + ASSERT_EQ(s1->version()->at(2).checksum, 0xfUL); + ASSERT_EQ(s1->version()->at(3).checksum, 0xfUL); + + // Update RefPage3, both Page2 and RefPage3 got updated. + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0xff; + edit.put(3, e); + versions.apply(edit); + } + auto s2 = versions.getSnapshot(); + ASSERT_EQ(s2->version()->at(3).checksum, 0xffUL); + ASSERT_EQ(s2->version()->at(2).checksum, 0xffUL); + s2.reset(); + s1.reset(); + auto s3 = versions.getSnapshot(); + ASSERT_EQ(s3->version()->at(3).checksum, 0xffUL); + ASSERT_EQ(s3->version()->at(2).checksum, 0xffUL); + //s3.reset(); + + // Del Page2, RefPage3 still there + { + PageEntriesEdit edit; + edit.del(2); + versions.apply(edit); + } + auto s4 = versions.getSnapshot(); + ASSERT_EQ(s4->version()->find(2), nullptr); + ASSERT_EQ(s4->version()->at(3).checksum, 0xffUL); + s4.reset(); + ASSERT_EQ(s3->version()->at(2).checksum, 0xffUL); + ASSERT_EQ(s3->version()->at(3).checksum, 0xffUL); + s3.reset(); + + auto s5 = versions.getSnapshot(); + ASSERT_EQ(s5->version()->find(2), nullptr); + ASSERT_EQ(s5->version()->at(3).checksum, 0xffUL); +} + +TYPED_TEST_P(PageMapVersionSet_test, UpdateOnRefPage2) +{ + TypeParam versions(this->config_); + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0xf; + edit.put(2, e); + edit.ref(3, 2); + edit.del(2); + versions.apply(edit); + } + auto s1 = versions.getSnapshot(); + ASSERT_EQ(s1->version()->find(2), nullptr); + ASSERT_EQ(s1->version()->at(3).checksum, 0xfUL); + + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0x9; + edit.put(2, e); + edit.del(2); + versions.apply(edit); + } + auto s2 = 
versions.getSnapshot(); + ASSERT_EQ(s2->version()->find(2), nullptr); + ASSERT_EQ(s2->version()->at(3).checksum, 0x9UL); +} + +TYPED_TEST_P(PageMapVersionSet_test, IsRefId) +{ + TypeParam versions(this->config_); + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0xf; + edit.put(1, e); + edit.ref(2, 1); + versions.apply(edit); + } + auto s1 = versions.getSnapshot(); + bool is_ref; + PageId normal_page_id; + std::tie(is_ref, normal_page_id) = s1->version()->isRefId(2); + ASSERT_TRUE(is_ref); + ASSERT_EQ(normal_page_id, 1UL); + + { + PageEntriesEdit edit; + edit.del(2); + versions.apply(edit); + } + auto s2 = versions.getSnapshot(); + std::tie(is_ref, normal_page_id) = s2->version()->isRefId(2); + ASSERT_FALSE(is_ref); +} + +TYPED_TEST_P(PageMapVersionSet_test, Snapshot) +{ + TypeParam versions(this->config_); + ASSERT_EQ(versions.size(), 1UL); + { + PageEntriesEdit init_edit; + PageEntry e; + e.checksum = 0x123; + init_edit.put(0, e); + e.checksum = 0x1234; + init_edit.put(1, e); + versions.apply(init_edit); + ASSERT_EQ(versions.size(), 1UL); + } + + auto s1 = versions.getSnapshot(); + { + PageEntriesEdit edit; + PageEntry e; + e.checksum = 0x456; + edit.put(0, e); + edit.del(1); + versions.apply(edit); + } + ASSERT_EQ(versions.size(), 2UL); // previous version is hold by `s1`, list size grow to 2 + + auto s2 = versions.getSnapshot(); + auto p0 = s2->version()->find(0); + ASSERT_NE(p0, nullptr); + ASSERT_EQ(p0->checksum, 0x456UL); // entry is updated in snapshot 2 + auto p1 = s2->version()->find(1); + ASSERT_EQ(p1, nullptr); +} + +TYPED_TEST_P(PageMapVersionSet_test, LiveFiles) +{ + TypeParam versions(this->config_); + + { + PageEntriesEdit edit; + PageEntry e; + e.file_id = 1; + e.level = 0; + edit.put(0, e); + e.file_id = 2; + edit.put(1, e); + e.file_id = 3; + edit.put(2, e); + versions.apply(edit); + } + auto s1 = versions.getSnapshot(); + { + PageEntriesEdit edit; + edit.del(0); + PageEntry e; + e.file_id = 3; + e.level = 1; + edit.put(3, e); + 
versions.apply(edit); + } + auto s2 = versions.getSnapshot(); + { + PageEntriesEdit edit; + edit.del(3); + versions.apply(edit); + } + auto s3 = versions.getSnapshot(); + s3.reset(); // do compact on version-list, and + auto livefiles = versions.listAllLiveFiles(); + ASSERT_EQ(livefiles.size(), 4UL); + ASSERT_EQ(livefiles.count(std::make_pair(1, 0)), 1UL); // hold by s1 + ASSERT_EQ(livefiles.count(std::make_pair(2, 0)), 1UL); // hold by current, s1, s2 + ASSERT_EQ(livefiles.count(std::make_pair(3, 0)), 1UL); // hold by current, s1, s2 + ASSERT_EQ(livefiles.count(std::make_pair(3, 1)), 1UL); // hold by s2 + + s2.reset(); + livefiles = versions.listAllLiveFiles(); + ASSERT_EQ(livefiles.size(), 3UL); + ASSERT_EQ(livefiles.count(std::make_pair(1, 0)), 1UL); // hold by s1 + ASSERT_EQ(livefiles.count(std::make_pair(2, 0)), 1UL); // hold by current, s1 + ASSERT_EQ(livefiles.count(std::make_pair(3, 0)), 1UL); // hold by current, s1 + + s1.reset(); + livefiles = versions.listAllLiveFiles(); + ASSERT_EQ(livefiles.size(), 2UL); + ASSERT_EQ(livefiles.count(std::make_pair(2, 0)), 1UL); // hold by current + ASSERT_EQ(livefiles.count(std::make_pair(3, 0)), 1UL); // hold by current +} + +REGISTER_TYPED_TEST_CASE_P(PageMapVersionSet_test, + ApplyEdit, + ApplyEditWithReadLock, + ApplyEditWithReadLock2, + ApplyEditWithReadLock3, + Restore, + GcConcurrencyDelPage, + GcPageMove, + GcConcurrencySetPage, + UpdateOnRefPage, + UpdateOnRefPage2, + IsRefId, + Snapshot, + LiveFiles); + +using VersionSetTypes = ::testing::Types; +INSTANTIATE_TYPED_TEST_CASE_P(VersionSetTypedTest, PageMapVersionSet_test, VersionSetTypes); + + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Storages/Page/tests/gtest_page_storage.cpp b/dbms/src/Storages/Page/tests/gtest_page_storage.cpp index ab7e671b999..09966c3a6ee 100644 --- a/dbms/src/Storages/Page/tests/gtest_page_storage.cpp +++ b/dbms/src/Storages/Page/tests/gtest_page_storage.cpp @@ -1,5 +1,19 @@ #include "gtest/gtest.h" +#include +#include 
+#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + #define private public #include #undef private @@ -12,40 +26,61 @@ namespace tests class PageStorage_test : public ::testing::Test { public: - PageStorage_test(): path("./t"), storage() {} + PageStorage_test() : path("./t"), storage() {} protected: - void SetUp() override { + static void SetUpTestCase() + { + Poco::AutoPtr channel = new Poco::ConsoleChannel(std::cerr); + Poco::AutoPtr formatter(new Poco::PatternFormatter); + formatter->setProperty("pattern", "%L%Y-%m-%d %H:%M:%S.%i <%p> %s: %t"); + Poco::AutoPtr formatting_channel(new Poco::FormattingChannel(formatter, channel)); + Logger::root().setChannel(formatting_channel); + Logger::root().setLevel("trace"); + } + + void SetUp() override + { // drop dir if exists Poco::File file(path); - if (file.exists()) { + if (file.exists()) + { file.remove(true); } - config.file_roll_size = 512; + // default test config + config.file_roll_size = 512; config.merge_hint_low_used_file_num = 1; - storage = std::make_shared(path, config); + + storage = reopenWithConfig(config); + } + + std::shared_ptr reopenWithConfig(const PageStorage::Config & config_) + { + return std::make_shared(path, config_); } + protected: - String path; - PageStorage::Config config; + String path; + PageStorage::Config config; std::shared_ptr storage; }; TEST_F(PageStorage_test, WriteRead) { + const UInt64 tag = 0; const size_t buf_sz = 1024; - char c_buff[buf_sz]; + char c_buff[buf_sz]; for (size_t i = 0; i < buf_sz; ++i) { c_buff[i] = i % 0xff; } { - WriteBatch batch; - ReadBufferPtr buff = std::make_shared(c_buff,sizeof(c_buff)); - batch.putPage(0, 0, buff, buf_sz); + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(0, tag, buff, buf_sz); buff = std::make_shared(c_buff, sizeof(c_buff)); - batch.putPage(1, 0, buff, buf_sz); + batch.putPage(1, tag, buff, buf_sz); storage->write(batch); } @@ -65,14 
+100,53 @@ TEST_F(PageStorage_test, WriteRead) } } -TEST_F(PageStorage_test, WriteReadGc) +TEST_F(PageStorage_test, WriteMultipleBatchRead) +{ + const UInt64 tag = 0; + const size_t buf_sz = 1024; + char c_buff[buf_sz]; + for (size_t i = 0; i < buf_sz; ++i) + { + c_buff[i] = i % 0xff; + } + + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(0, tag, buff, buf_sz); + storage->write(batch); + } + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(1, tag, buff, buf_sz); + storage->write(batch); + } + + Page page0 = storage->read(0); + ASSERT_EQ(page0.data.size(), buf_sz); + ASSERT_EQ(page0.page_id, 0UL); + for (size_t i = 0; i < buf_sz; ++i) + { + EXPECT_EQ(*(page0.data.begin() + i), static_cast(i % 0xff)); + } + Page page1 = storage->read(1); + ASSERT_EQ(page1.data.size(), buf_sz); + ASSERT_EQ(page1.page_id, 1UL); + for (size_t i = 0; i < buf_sz; ++i) + { + EXPECT_EQ(*(page1.data.begin() + i), static_cast(i % 0xff)); + } +} + +TEST_F(PageStorage_test, WriteReadAfterGc) { const size_t buf_sz = 256; - char c_buff[buf_sz]; + char c_buff[buf_sz]; const size_t num_repeat = 10; - PageId pid = 1; - const char page0_byte = 0x3f; + PageId pid = 1; + const char page0_byte = 0x3f; { // put page0 WriteBatch batch; @@ -128,65 +202,556 @@ TEST_F(PageStorage_test, WriteReadGc) EXPECT_EQ(*(page1.data.begin() + i), static_cast(num_repeat % 0xff)); } } +} + +TEST_F(PageStorage_test, GcMigrateValidRefPages) +{ + const size_t buf_sz = 1024; + char buf[buf_sz] = {0}; + const PageId ref_id = 1024; + const PageId page_id = 32; + + const PageId placeholder_page_id = 33; + const PageId deleted_ref_id = 1025; + + { + // prepare ref page record without any valid pages + PageStorage::Config tmp_config(config); + tmp_config.file_roll_size = 1; + storage = reopenWithConfig(tmp_config); + { + // this batch is written to PageFile{1,0} + WriteBatch batch; + batch.putPage(page_id, 0, 
std::make_shared(buf, buf_sz), buf_sz); + storage->write(batch); + } + { + // this batch is written to PageFile{2,0} + WriteBatch batch; + batch.putRefPage(ref_id, page_id); + batch.delPage(page_id); + // deleted ref pages will not migrate + batch.putRefPage(deleted_ref_id, page_id); + batch.putPage(placeholder_page_id, 0, std::make_shared(buf, buf_sz), buf_sz); + storage->write(batch); + } + { + // this batch is written to PageFile{3,0} + WriteBatch batch; + batch.delPage(deleted_ref_id); + storage->write(batch); + } + const PageEntry entry2 = storage->getEntry(ref_id); + ASSERT_TRUE(entry2.isValid()); + } + const PageStorage::GcLivesPages lives_pages; + PageStorage::GcCandidates candidates; + PageStorage::SnapshotPtr snapshot = storage->getSnapshot(); + //candidates.insert(PageFileIdAndLevel{1, 0}); + candidates.insert(PageFileIdAndLevel{2, 0}); + const PageEntriesEdit gc_file_edit = storage->gcMigratePages(snapshot, lives_pages, candidates); + ASSERT_FALSE(gc_file_edit.empty()); + // check the ref is migrated. + // check the deleted ref is not migrated. 
+ bool is_deleted_ref_id_exists = false; + for (const auto & rec : gc_file_edit.getRecords()) + { + if (rec.type == WriteBatch::WriteType::REF) + { + if (rec.page_id == ref_id) + { + ASSERT_EQ(rec.ori_page_id, page_id); + } + if (rec.page_id == deleted_ref_id) + { + ASSERT_NE(rec.ori_page_id, page_id); + is_deleted_ref_id_exists = true; + } + } + } + ASSERT_FALSE(is_deleted_ref_id_exists); } -TEST_F(PageStorage_test, GcConcurrencyDelPage) -{ - PageId pid = 0; - // gc move Page0 -> PageFile{5,1} - PageCacheMap map; - map.emplace(pid, PageCache{.file_id=1, .level=0}); - // write thread del Page0 in page_map before gc thread get unique_lock of `read_mutex` - storage->page_cache_map.clear(); - // gc continue - storage->gcUpdatePageMap(map); - // page0 don't update to page_map - const PageCache entry = storage->getCache(pid); - ASSERT_FALSE(entry.isValid()); +TEST_F(PageStorage_test, GcMoveRefPage) +{ + const size_t buf_sz = 256; + char c_buff[buf_sz]; + + { + WriteBatch batch; + memset(c_buff, 0xf, buf_sz); + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(1, 0, buff, buf_sz); + batch.putRefPage(2, 1); + batch.putRefPage(3, 2); + + batch.delPage(2); + + storage->write(batch); + } + + PageFileIdAndLevel id_and_lvl = {1, 0}; // PageFile{1, 0} is ready to be migrated by gc + PageStorage::GcLivesPages livesPages{{id_and_lvl, + {buf_sz, + { + 1, + }}}}; + PageStorage::GcCandidates candidates{ + id_and_lvl, + }; + auto s0 = storage->getSnapshot(); + PageEntriesEdit edit = storage->gcMigratePages(s0, livesPages, candidates); + + // After migrate, RefPage 3 -> 1 is still valid + bool exist = false; + for (const auto & rec : edit.getRecords()) + { + if (rec.type == WriteBatch::WriteType::REF && rec.page_id == 3 && rec.ori_page_id == 1) + { + exist = true; + break; + } + } + ASSERT_TRUE(exist); + s0.reset(); + + // reopen PageStorage, RefPage 3 -> 1 is still valid + storage = reopenWithConfig(config); + auto s1 = storage->getSnapshot(); + auto 
[is_ref, normal_page_id] = s1->version()->isRefId(3); + ASSERT_TRUE(is_ref); + ASSERT_EQ(normal_page_id, 1UL); } -static void EXPECT_PagePos_LT(PageFileIdAndLevel p0, PageFileIdAndLevel p1) +/** + * PageStorage tests with predefine Page1 && Page2 + */ +class PageStorageWith2Pages_test : public PageStorage_test { - EXPECT_LT(p0, p1); +public: + PageStorageWith2Pages_test() : PageStorage_test() {} + +protected: + void SetUp() override + { + PageStorage_test::SetUp(); + // put predefine Page1, Page2 + const size_t buf_sz = 1024; + char buf[buf_sz]; + { + WriteBatch wb; + memset(buf, 0x01, buf_sz); + wb.putPage(1, 0, std::make_shared(buf, buf_sz), buf_sz); + storage->write(wb); + } + { + WriteBatch wb; + memset(buf, 0x02, buf_sz); + wb.putPage(2, 0, std::make_shared(buf, buf_sz), buf_sz); + storage->write(wb); + } + } +}; + +TEST_F(PageStorageWith2Pages_test, UpdateRefPages) +{ + /// update on RefPage, all references get updated. + const UInt64 tag = 0; + // put ref page: RefPage3 -> Page2 + { + WriteBatch batch; + batch.putRefPage(3, 2); + storage->write(batch); + } + const size_t buf_sz = 1024; + char buf[buf_sz]; + // if update PageId == 3 or PageId == 2, both RefPage3 && Page2 get updated + { + // update RefPage3 + WriteBatch batch; + char ch_to_update = 0x0f; + memset(buf, ch_to_update, buf_sz); + batch.putPage(3, tag, std::make_shared(buf, buf_sz), buf_sz); + storage->write(batch); + + // check RefPage3 and Page2 both get updated + const Page page3 = storage->read(3); + for (size_t i = 0; i < page3.data.size(); ++i) + { + EXPECT_EQ(*(page3.data.begin() + i), ch_to_update); + } + const Page page2 = storage->read(2); + for (size_t i = 0; i < page2.data.size(); ++i) + { + EXPECT_EQ(*(page2.data.begin() + i), ch_to_update); + } + } + { + // update Page2 + WriteBatch batch; + char ch_to_update = 0xef; + memset(buf, ch_to_update, buf_sz); + batch.putPage(2, tag, std::make_shared(buf, buf_sz), buf_sz); + storage->write(batch); + + // check RefPage3 and Page2 both get 
updated + const Page page3 = storage->read(3); + for (size_t i = 0; i < page3.data.size(); ++i) + { + EXPECT_EQ(*(page3.data.begin() + i), ch_to_update); + } + const Page page2 = storage->read(2); + for (size_t i = 0; i < page2.data.size(); ++i) + { + EXPECT_EQ(*(page2.data.begin() + i), ch_to_update); + } + } } -TEST_F(PageStorage_test, GcPageMove) -{ - EXPECT_PagePos_LT({4, 0}, {5, 1}); - EXPECT_PagePos_LT({5, 0}, {5, 1}); - EXPECT_PagePos_LT({5, 1}, {6, 1}); - EXPECT_PagePos_LT({5, 2}, {6, 1}); - - const PageId pid = 0; - // old Page0 is in PageFile{5, 0} - storage->page_cache_map.emplace(pid, PageCache{.file_id=5, .level=0,}); - // gc move Page0 -> PageFile{5,1} - PageCacheMap map; - map.emplace(pid, PageCache{.file_id=5, .level=1,}); - storage->gcUpdatePageMap(map); - // page_map get updated - const PageCache entry = storage->getCache(pid); - ASSERT_TRUE(entry.isValid()); - ASSERT_EQ(entry.file_id, 5u); - ASSERT_EQ(entry.level, 1u); +TEST_F(PageStorageWith2Pages_test, DeleteRefPages) +{ + // put ref page: RefPage3 -> Page2, RefPage4 -> Page2 + { + WriteBatch batch; + batch.putRefPage(3, 2); + batch.putRefPage(4, 2); + storage->write(batch); + } + { // tests for delete Page + // delete RefPage3, RefPage4 don't get deleted + { + WriteBatch batch; + batch.delPage(3); + storage->write(batch); + EXPECT_FALSE(storage->getEntry(3).isValid()); + EXPECT_TRUE(storage->getEntry(4).isValid()); + } + // delete RefPage4 + { + WriteBatch batch; + batch.delPage(4); + storage->write(batch); + EXPECT_FALSE(storage->getEntry(4).isValid()); + } + } } -TEST_F(PageStorage_test, GcConcurrencySetPage) -{ - const PageId pid = 0; - // gc move Page0 -> PageFile{5,1} - PageCacheMap map; - map.emplace(pid, PageCache{.file_id=5, .level=1,}); - // write thread insert newer Page0 before gc thread get unique_lock on `read_mutex` - storage->page_cache_map.emplace(pid, PageCache{.file_id=6, .level=0,}); - // gc continue - storage->gcUpdatePageMap(map); - // read - const PageCache entry = 
storage->getCache(pid); - ASSERT_TRUE(entry.isValid()); - ASSERT_EQ(entry.file_id, 6u); - ASSERT_EQ(entry.level, 0u); +TEST_F(PageStorageWith2Pages_test, PutRefPagesOverRefPages) +{ + /// put ref page to ref page, ref path collapse to normal page + { + WriteBatch batch; + // RefPage3 -> Page1 + batch.putRefPage(3, 1); + // RefPage4 -> RefPage3 -> Page1 + batch.putRefPage(4, 3); + storage->write(batch); + } + + const auto p0entry = storage->getEntry(1); + + { + // check that RefPage3 -> Page1 + auto entry = storage->getEntry(3); + ASSERT_EQ(entry.fileIdLevel(), p0entry.fileIdLevel()); + ASSERT_EQ(entry.offset, p0entry.offset); + ASSERT_EQ(entry.size, p0entry.size); + const Page page3 = storage->read(3); + for (size_t i = 0; i < page3.data.size(); ++i) + { + EXPECT_EQ(*(page3.data.begin() + i), 0x01); + } + } + + { + // check that RefPage4 -> Page1 + auto entry = storage->getEntry(4); + ASSERT_EQ(entry.fileIdLevel(), p0entry.fileIdLevel()); + ASSERT_EQ(entry.offset, p0entry.offset); + ASSERT_EQ(entry.size, p0entry.size); + const Page page4 = storage->read(4); + for (size_t i = 0; i < page4.data.size(); ++i) + { + EXPECT_EQ(*(page4.data.begin() + i), 0x01); + } + } +} + +TEST_F(PageStorageWith2Pages_test, PutDuplicateRefPages) +{ + /// put duplicated RefPages in different WriteBatch + { + WriteBatch batch; + batch.putRefPage(3, 1); + storage->write(batch); + + WriteBatch batch2; + batch2.putRefPage(3, 1); + storage->write(batch); + // now Page1's entry has ref count == 2 but not 3 + } + PageEntry entry1 = storage->getEntry(1); + ASSERT_TRUE(entry1.isValid()); + PageEntry entry3 = storage->getEntry(3); + ASSERT_TRUE(entry3.isValid()); + + EXPECT_EQ(entry1.fileIdLevel(), entry3.fileIdLevel()); + EXPECT_EQ(entry1.offset, entry3.offset); + EXPECT_EQ(entry1.size, entry3.size); + EXPECT_EQ(entry1.checksum, entry3.checksum); + + // check Page1's entry has ref count == 2 but not 1 + { + WriteBatch batch; + batch.delPage(1); + storage->write(batch); + PageEntry 
entry_after_del1 = storage->getEntry(3); + ASSERT_TRUE(entry_after_del1.isValid()); + EXPECT_EQ(entry1.fileIdLevel(), entry_after_del1.fileIdLevel()); + EXPECT_EQ(entry1.offset, entry_after_del1.offset); + EXPECT_EQ(entry1.size, entry_after_del1.size); + EXPECT_EQ(entry1.checksum, entry_after_del1.checksum); + + WriteBatch batch2; + batch2.delPage(3); + storage->write(batch2); + PageEntry entry_after_del2 = storage->getEntry(3); + ASSERT_FALSE(entry_after_del2.isValid()); + } +} + +TEST_F(PageStorageWith2Pages_test, PutCollapseDuplicatedRefPages) +{ + /// put duplicated RefPages due to ref-path-collapse + { + WriteBatch batch; + // RefPage3 -> Page1 + batch.putRefPage(3, 1); + // RefPage4 -> RefPage3, collapse to RefPage4 -> Page1 + batch.putRefPage(4, 3); + storage->write(batch); + + WriteBatch batch2; + // RefPage4 -> Page1, duplicated due to ref-path-collapse + batch2.putRefPage(4, 1); + storage->write(batch); + // now Page1's entry has ref count == 3 but not 2 + } + + PageEntry entry1 = storage->getEntry(1); + ASSERT_TRUE(entry1.isValid()); + PageEntry entry3 = storage->getEntry(3); + ASSERT_TRUE(entry3.isValid()); + PageEntry entry4 = storage->getEntry(4); + ASSERT_TRUE(entry4.isValid()); + + EXPECT_EQ(entry1.fileIdLevel(), entry4.fileIdLevel()); + EXPECT_EQ(entry1.offset, entry4.offset); + EXPECT_EQ(entry1.size, entry4.size); + EXPECT_EQ(entry1.checksum, entry4.checksum); + + // check Page1's entry has ref count == 3 but not 2 + { + WriteBatch batch; + batch.delPage(1); + batch.delPage(4); + storage->write(batch); + PageEntry entry_after_del2 = storage->getEntry(3); + ASSERT_TRUE(entry_after_del2.isValid()); + EXPECT_EQ(entry1.fileIdLevel(), entry_after_del2.fileIdLevel()); + EXPECT_EQ(entry1.offset, entry_after_del2.offset); + EXPECT_EQ(entry1.size, entry_after_del2.size); + EXPECT_EQ(entry1.checksum, entry_after_del2.checksum); + + WriteBatch batch2; + batch2.delPage(3); + storage->write(batch2); + PageEntry entry_after_del3 = storage->getEntry(3); + 
ASSERT_FALSE(entry_after_del3.isValid()); + } +} + +TEST_F(PageStorageWith2Pages_test, AddRefPageToNonExistPage) +{ + { + WriteBatch batch; + // RefPage3 -> non-exist Page999 + batch.putRefPage(3, 999); + ASSERT_NO_THROW(storage->write(batch)); + } + + ASSERT_FALSE(storage->getEntry(3).isValid()); + ASSERT_THROW(storage->read(3), DB::Exception); + + // Invalid Pages is filtered after reopen PageStorage + ASSERT_NO_THROW(reopenWithConfig(config)); + ASSERT_FALSE(storage->getEntry(3).isValid()); +} + +TEST_F(PageStorageWith2Pages_test, SnapshotReadSnapshotVersion) +{ + char ch_before = 0x01; + char ch_update = 0xFF; + auto snapshot = storage->getSnapshot(); + PageEntry p1_snapshot_entry = storage->getEntry(1, snapshot); + + { + // write new version of Page1 + const size_t buf_sz = 1024; + char buf[buf_sz]; + { + WriteBatch wb; + memset(buf, ch_update, buf_sz); + wb.putPage(1, 0, std::make_shared(buf, buf_sz), buf_sz); + wb.putPage(3, 0, std::make_shared(buf, buf_sz), buf_sz); + storage->write(wb); + } + } + + { + /// read without snapshot + PageEntry p1_entry = storage->getEntry(1); + ASSERT_NE(p1_entry.checksum, p1_snapshot_entry.checksum); + + Page page1 = storage->read(1); + ASSERT_EQ(*page1.data.begin(), ch_update); + + // Page3 + PageEntry p3_entry = storage->getEntry(3); + ASSERT_TRUE(p3_entry.isValid()); + Page page3 = storage->read(3); + ASSERT_EQ(*page3.data.begin(), ch_update); + } + + { + /// read with snapshot + // getEntry with snapshot + PageEntry p1_entry = storage->getEntry(1, snapshot); + ASSERT_EQ(p1_entry.checksum, p1_snapshot_entry.checksum); + + // read(PageId) with snapshot + Page page1 = storage->read(1, snapshot); + ASSERT_EQ(*page1.data.begin(), ch_before); + + // read(vec) with snapshot + PageIds ids{ + 1, + }; + auto pages = storage->read(ids, snapshot); + ASSERT_EQ(pages.count(1), 1UL); + ASSERT_EQ(*pages[1].data.begin(), ch_before); + // TODO read(vec, callback) with snapshot + + // new page do appear while read with snapshot + PageEntry 
p3_entry = storage->getEntry(3, snapshot); + ASSERT_FALSE(p3_entry.isValid()); + ASSERT_THROW({ storage->read(3, snapshot); }, DB::Exception); + } +} + +TEST_F(PageStorageWith2Pages_test, GetIdenticalSnapshots) +{ + char ch_before = 0x01; + char ch_update = 0xFF; + PageEntry p1_snapshot_entry = storage->getEntry(1); + auto s1 = storage->getSnapshot(); + auto s2 = storage->getSnapshot(); + auto s3 = storage->getSnapshot(); + + { + // write new version of Page1 + const size_t buf_sz = 1024; + char buf[buf_sz]; + { + WriteBatch wb; + memset(buf, ch_update, buf_sz); + wb.putPage(1, 0, std::make_shared(buf, buf_sz), buf_sz); + wb.putPage(3, 0, std::make_shared(buf, buf_sz), buf_sz); + storage->write(wb); + } + } + + /// read with snapshot + const PageIds ids{ + 1, + }; + // getEntry with snapshot + PageEntry p1_entry = storage->getEntry(1, s1); + ASSERT_EQ(p1_entry.checksum, p1_snapshot_entry.checksum); + p1_entry = storage->getEntry(1, s2); + ASSERT_EQ(p1_entry.checksum, p1_snapshot_entry.checksum); + p1_entry = storage->getEntry(1, s3); + ASSERT_EQ(p1_entry.checksum, p1_snapshot_entry.checksum); + // read(PageId) with snapshot + Page page1 = storage->read(1, s1); + ASSERT_EQ(*page1.data.begin(), ch_before); + page1 = storage->read(1, s2); + ASSERT_EQ(*page1.data.begin(), ch_before); + page1 = storage->read(1, s3); + ASSERT_EQ(*page1.data.begin(), ch_before); + // read(vec) with snapshot + auto pages = storage->read(ids, s1); + ASSERT_EQ(pages.count(1), 1UL); + ASSERT_EQ(*pages[1].data.begin(), ch_before); + pages = storage->read(ids, s2); + ASSERT_EQ(pages.count(1), 1UL); + ASSERT_EQ(*pages[1].data.begin(), ch_before); + pages = storage->read(ids, s3); + ASSERT_EQ(pages.count(1), 1UL); + ASSERT_EQ(*pages[1].data.begin(), ch_before); + // TODO read(vec, callback) with snapshot + // without snapshot + p1_entry = storage->getEntry(1); + ASSERT_NE(p1_entry.checksum, p1_snapshot_entry.checksum); + + s1.reset(); /// free snapshot 1 + + // getEntry with snapshot + p1_entry = 
storage->getEntry(1, s2); + ASSERT_EQ(p1_entry.checksum, p1_snapshot_entry.checksum); + p1_entry = storage->getEntry(1, s3); + ASSERT_EQ(p1_entry.checksum, p1_snapshot_entry.checksum); + // read(PageId) with snapshot + page1 = storage->read(1, s2); + ASSERT_EQ(*page1.data.begin(), ch_before); + page1 = storage->read(1, s3); + ASSERT_EQ(*page1.data.begin(), ch_before); + // read(vec) with snapshot + ASSERT_EQ(*pages[1].data.begin(), ch_before); + pages = storage->read(ids, s2); + ASSERT_EQ(pages.count(1), 1UL); + ASSERT_EQ(*pages[1].data.begin(), ch_before); + pages = storage->read(ids, s3); + ASSERT_EQ(pages.count(1), 1UL); + ASSERT_EQ(*pages[1].data.begin(), ch_before); + // TODO read(vec, callback) with snapshot + // without snapshot + p1_entry = storage->getEntry(1); + ASSERT_NE(p1_entry.checksum, p1_snapshot_entry.checksum); + + s2.reset(); /// free snapshot 2 + + // getEntry with snapshot + p1_entry = storage->getEntry(1, s3); + ASSERT_EQ(p1_entry.checksum, p1_snapshot_entry.checksum); + // read(PageId) with snapshot + page1 = storage->read(1, s3); + ASSERT_EQ(*page1.data.begin(), ch_before); + // read(vec) with snapshot + pages = storage->read(ids, s3); + ASSERT_EQ(pages.count(1), 1UL); + ASSERT_EQ(*pages[1].data.begin(), ch_before); + // TODO read(vec, callback) with snapshot + // without snapshot + p1_entry = storage->getEntry(1); + ASSERT_NE(p1_entry.checksum, p1_snapshot_entry.checksum); + + s3.reset(); /// free snapshot 3 + + // without snapshot + p1_entry = storage->getEntry(1); + ASSERT_NE(p1_entry.checksum, p1_snapshot_entry.checksum); } } // namespace tests diff --git a/dbms/src/Storages/Page/tests/mem_usage_test.cpp b/dbms/src/Storages/Page/tests/mem_usage_test.cpp new file mode 100644 index 00000000000..21ed9a0e771 --- /dev/null +++ b/dbms/src/Storages/Page/tests/mem_usage_test.cpp @@ -0,0 +1,81 @@ +#include +#include + +#include +#include +#include +#include +#include + + +using PageId = uint64_t; +struct PageEntry +{ + // if file_id == 0, means 
it is invalid + uint64_t file_id = 0; + uint32_t level = 0; + uint32_t size = 0; + uint64_t offset = 0; + uint64_t tag = 0; + uint64_t checksum = 0; + uint32_t ref = 1; // for ref counting + + inline bool isValid() const { return file_id != 0; } + inline std::pair fileIdLevel() const { return std::make_pair(file_id, level); } +}; + +int main(int argc, char ** argv) +{ + std::string mode; + size_t num_entries = 9 * 1000 * 1000; + if (argc < 2) + { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + else + { + mode = argv[1]; + if (mode != "hash" && mode != "vec" && mode != "tree") + { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + if (argc >= 3) + { + num_entries = strtol(argv[2], nullptr, 10); + } + } + + printf("inserting to %s with size: %zu\n", mode.c_str(), num_entries); + std::unordered_map entries_map; + std::map entries_tree_map; + std::vector> entries_vec; + for (size_t i = 0; i < num_entries; ++i) + { + if (i % (1000 * 1000) == 0) + printf("insert %zu done.\n", i); + if (mode == "hash") + { + // 9,000,000 entries 804.6 MB + // 18,100,100 entries 1.57 GB + entries_map.insert(std::make_pair(i, PageEntry{.file_id = i})); + } + else if (mode == "tree") + { + // 9,000,000 entries 837.3 MB + // 18,000,000 entries 1.64 GB + entries_tree_map.insert(std::make_pair(i, PageEntry{.file_id = i})); + } + else if (mode == "vec") + { + // 9,000,000 entries 488.0 MB + // 18,000,000 entries 968.7 MB + entries_vec.push_back(std::make_pair(i, PageEntry{.file_id = i})); + } + } + printf("All insert to %s done.\n", mode.c_str()); + std::cin.get(); + + return 0; +} diff --git a/dbms/src/Storages/Page/tests/stress_page_stroage.cpp b/dbms/src/Storages/Page/tests/stress_page_stroage.cpp index 47def472b6a..d6afd4a16fc 100644 --- a/dbms/src/Storages/Page/tests/stress_page_stroage.cpp +++ b/dbms/src/Storages/Page/tests/stress_page_stroage.cpp @@ -1,78 +1,166 @@ #include +#include #include #include #include +#include #include +#include #include +#include 
#include #include #include #include +#include +#include #include +#include + +using std::chrono::high_resolution_clock; +using std::chrono::milliseconds; using PSPtr = std::shared_ptr; const DB::PageId MAX_PAGE_ID = 1000; std::atomic running_without_exception = true; +std::atomic running_without_timeout = true; class PSWriter : public Poco::Runnable { PSPtr ps; std::mt19937 gen; + static size_t approx_page_mb; + public: - PSWriter(const PSPtr & ps_) : ps(ps_), gen() {} + PSWriter(const PSPtr & ps_) : ps(ps_), gen(), bytes_written(0), pages_written(0) {} + + static void setApproxPageSize(size_t size_mb) + { + LOG_INFO(&Logger::get("root"), "Page approx size is set to " + DB::toString(size_mb) + "MB"); + approx_page_mb = size_mb; + } + + static DB::ReadBufferPtr genRandomData(const DB::PageId pageId, DB::MemHolder & holder) + { + // fill page with random bytes + const size_t buff_sz = approx_page_mb * 1024 * 1024 + random() % 3000; + char * buff = (char *)malloc(buff_sz); + const char buff_ch = pageId % 0xFF; + memset(buff, buff_ch, buff_sz); + + holder = DB::createMemHolder(buff, [&](char * p) { free(p); }); + + return std::make_shared(buff, buff_sz); + } + + static void fillAllPages(const PSPtr & ps) + { + for (DB::PageId pageId = 0; pageId < MAX_PAGE_ID; ++pageId) + { + DB::MemHolder holder; + DB::ReadBufferPtr buff = genRandomData(pageId, holder); + + DB::WriteBatch wb; + wb.putPage(pageId, 0, buff, buff->buffer().size()); + ps->write(wb); + if (pageId % 100 == 0) + LOG_INFO(&Logger::get("root"), "writer wrote page" + DB::toString(pageId)); + } + } + + size_t bytes_written; + size_t pages_written; + void run() override { - while (running_without_exception) + while (running_without_exception && running_without_timeout) { assert(ps != nullptr); std::normal_distribution<> d{MAX_PAGE_ID / 2, 150}; const DB::PageId pageId = static_cast(std::round(d(gen))) % MAX_PAGE_ID; //const DB::PageId pageId = random() % MAX_PAGE_ID; - DB::WriteBatch wb; - // fill page with random 
bytes - const size_t buff_sz = 2048 * 1024 + random() % 3000; - char *buff = new char[buff_sz]; - const char buff_ch = random() % 0xFF; - memset(buff, buff_ch, buff_sz); - wb.putPage(pageId, 0, std::make_shared(buff, buff_sz), buff_sz); - delete []buff; + DB::MemHolder holder; + DB::ReadBufferPtr buff = genRandomData(pageId, holder); + DB::WriteBatch wb; + wb.putPage(pageId, 0, buff, buff->buffer().size()); ps->write(wb); + ++pages_written; + bytes_written += buff->buffer().size(); + //LOG_INFO(&Logger::get("root"), "writer wrote page" + DB::toString(pageId)); } LOG_INFO(&Logger::get("root"), "writer exit"); } }; +size_t PSWriter::approx_page_mb = 2; + class PSReader : public Poco::Runnable { - PSPtr ps; + PSPtr ps; + const size_t heavy_read_delay_ms; public: - PSReader(const PSPtr & ps_) : ps(ps_) {} + PSReader(const PSPtr & ps_, size_t delay_ms) : ps(ps_), heavy_read_delay_ms(delay_ms), pages_read(0), bytes_read(0) {} + + size_t pages_read; + size_t bytes_read; + void run() override { - while (running_without_exception) + while (running_without_exception && running_without_timeout) { { - const uint32_t micro_seconds_to_sleep = random() % 50; + // sleep [0~10) ms + const uint32_t micro_seconds_to_sleep = random() % 10; usleep(micro_seconds_to_sleep * 1000); } assert(ps != nullptr); +#if 0 const DB::PageId pageId = random() % MAX_PAGE_ID; try { - ps->read(pageId); + DB::Page page = ps->read(pageId); + ++pages_read; + bytes_read += page.data.size(); + } + catch (DB::Exception & e) + { + LOG_TRACE(&Logger::get("root"), e.displayText()); + } +#else + std::vector pageIds; + for (size_t i = 0; i < 5; ++i) + { + pageIds.emplace_back(random() % MAX_PAGE_ID); + } + try + { + // std::function; + DB::PageHandler handler = [&](DB::PageId page_id, const DB::Page & page) { + (void)page_id; + // use `sleep` to mock heavy read + if (heavy_read_delay_ms > 0) + { + //const uint32_t micro_seconds_to_sleep = 10; + usleep(heavy_read_delay_ms * 1000); + } + ++pages_read; + bytes_read 
+= page.data.size(); + }; + ps->read(pageIds, handler); } catch (DB::Exception & e) { LOG_TRACE(&Logger::get("root"), e.displayText()); } +#endif } LOG_INFO(&Logger::get("root"), "reader exit"); } @@ -100,19 +188,21 @@ class PSGc } }; +class StressTimeout +{ +public: + void onTime(Poco::Timer & /* t */) + { + LOG_INFO(&Logger::get("root"), "timeout."); + running_without_timeout = false; + } +}; + int main(int argc, char ** argv) { (void)argc; (void)argv; - bool drop_before_run = false; - if (argc > 2) { - DB::String drop_str = argv[2]; - if (drop_str == "drop") { - drop_before_run = true; - } - } - Poco::AutoPtr channel = new Poco::ConsoleChannel(std::cerr); Poco::AutoPtr formatter(new Poco::PatternFormatter); formatter->setProperty("pattern", "%L%Y-%m-%d %H:%M:%S.%i <%p> %s: %t"); @@ -120,12 +210,47 @@ int main(int argc, char ** argv) Logger::root().setChannel(formatting_channel); Logger::root().setLevel("trace"); + bool drop_before_run = false; + long timeout_s = 0; + size_t num_writers = 1; + size_t num_readers = 4; + size_t heavy_read_delay_ms = 0; + if (argc >= 2) + { + DB::String drop_str = argv[1]; + if (drop_str == "drop") + drop_before_run = true; + // timeout for benchmark + if (argc >= 3) + timeout_s = strtol(argv[2], nullptr, 10); + // num writers + if (argc >= 4) + num_writers = strtoul(argv[3], nullptr, 10); + // num readers + if (argc >= 5) + num_readers = strtoul(argv[4], nullptr, 10); + if (argc >= 6) + { + size_t page_mb = strtoul(argv[5], nullptr, 10); + page_mb = std::max(page_mb, 1UL); + PSWriter::setApproxPageSize(page_mb); + } + if (argc >= 7) + { + heavy_read_delay_ms = strtoul(argv[6], nullptr, 10); + heavy_read_delay_ms = std::max(heavy_read_delay_ms, 0); + LOG_INFO(&Logger::get("root"), "read dealy: " + DB::toString(heavy_read_delay_ms) + "ms"); + } + } + // set random seed + srand(0x123987); const DB::String path = "./stress"; // drop dir if exists Poco::File file(path); if (file.exists() && drop_before_run) { + 
LOG_INFO(&Logger::get("root"), "All pages have been drop."); file.remove(true); } @@ -133,30 +258,86 @@ int main(int argc, char ** argv) DB::PageStorage::Config config; PSPtr ps = std::make_shared(path, config); + // init all pages in PageStorage + PSWriter::fillAllPages(ps); + LOG_INFO(&Logger::get("root"), "All pages have been init."); + + high_resolution_clock::time_point beginTime = high_resolution_clock::now(); + // create thread pool - const size_t num_readers = 4; - Poco::ThreadPool pool(/* minCapacity= */ 2 + num_readers); + LOG_INFO(&Logger::get("root"), + "start running with these threads: W:" + DB::toString(num_writers) + ",R:" + DB::toString(num_readers) + ",Gc:1"); + Poco::ThreadPool pool(/* minCapacity= */ 1 + num_writers + num_readers, 1 + num_writers + num_readers); // start one writer thread - PSWriter writer(ps); - pool.start(writer, "writer"); + std::vector> writers(num_writers); + for (size_t i = 0; i < num_writers; ++i) + { + writers[i] = std::make_shared(ps); + pool.start(*writers[i], "writer"); + } // start one gc thread PSGc gc(ps); - Poco::Timer timer(0, 30 * 1000); + Poco::Timer timer(0); timer.setStartInterval(1000); timer.setPeriodicInterval(30 * 1000); timer.start(Poco::TimerCallback(gc, &PSGc::onTime)); - // start mutiple read thread + // start multiple read thread std::vector> readers(num_readers); for (size_t i = 0; i < num_readers; ++i) { - readers[i] = std::make_shared(ps); + readers[i] = std::make_shared(ps, heavy_read_delay_ms); pool.start(*readers[i]); } + // set timeout + Poco::Timer timeout_timer(timeout_s); + StressTimeout canceler; + if (timeout_s > 0) + { + LOG_INFO(&Logger::get("root"), "benchmark timeout: " + DB::toString(timeout_s) + "s"); + timeout_timer.setStartInterval(timeout_s * 1000); + timeout_timer.start(Poco::TimerCallback(canceler, &StressTimeout::onTime)); + } + pool.joinAll(); + high_resolution_clock::time_point endTime = high_resolution_clock::now(); + milliseconds timeInterval = 
std::chrono::duration_cast(endTime - beginTime); + fprintf(stderr, "end in %ldms\n", timeInterval.count()); + double seconds_run = 1.0 * timeInterval.count() / 1000; + + size_t total_pages_written = 0; + size_t total_bytes_written = 0; + for (auto & writer : writers) + { + total_pages_written += writer->pages_written; + total_bytes_written += writer->bytes_written; + } + + size_t total_pages_read = 0; + size_t total_bytes_read = 0; + for (auto & reader : readers) + { + total_pages_read += reader->pages_read; + total_bytes_read += reader->bytes_read; + } + + const double GB = 1024 * 1024 * 1024; + fprintf(stderr, + "W: %zu pages, %.4lf GB, %.4lf GB/s\n", + total_pages_written, + total_bytes_written / GB, + total_bytes_written / GB / seconds_run); + fprintf(stderr, "R: %zu pages, %.4lf GB, %.4lf GB/s\n", total_pages_read, total_bytes_read / GB, total_bytes_read / GB / seconds_run); - return -1; -} + if (running_without_exception) + { + return 0; + } + else + { + return -1; + } +} \ No newline at end of file diff --git a/dbms/src/Storages/Page/tests/test_page_storage_write_disk_full.cpp b/dbms/src/Storages/Page/tests/test_page_storage_write_disk_full.cpp index ee477325880..edc911b4a76 100644 --- a/dbms/src/Storages/Page/tests/test_page_storage_write_disk_full.cpp +++ b/dbms/src/Storages/Page/tests/test_page_storage_write_disk_full.cpp @@ -3,12 +3,15 @@ #include #include +#include #include +#include #include #include #include #include #include +#include #include @@ -21,7 +24,7 @@ using PSPtr = std::shared_ptr; const DB::PageId MAX_PAGE_ID = 500; -void printPageEntry(const DB::PageId pid, const DB::PageCache & entry) +void printPageEntry(const DB::PageId pid, const DB::PageEntry & entry) { printf("\tpid:%9lu\t\t" "%9lu\t%u\t%u\t%9lu\t%9lu\t%016lx\n", @@ -134,7 +137,7 @@ int main(int argc, char ** argv) // Create PageStorage DB::PageStorage::Config config; config.file_roll_size = 96UL * 1024 * 1024; - PSPtr ps = std::make_shared(path, config); + PSPtr ps = 
std::make_shared(path, config); // Write until disk is full PSWriter writer(ps, path); @@ -144,13 +147,14 @@ int main(int argc, char ** argv) auto page_files = DB::PageStorage::listAllPageFiles(path, true, &Logger::get("root")); for (auto & page_file : page_files) { - DB::PageCacheMap page_entries; - const_cast(page_file).readAndSetPageMetas(page_entries); - printf("File: page_%lu_%u with %zu entries:\n", page_file.getFileId(), page_file.getLevel(), - page_entries.size()); - DB::PageIdAndCaches id_and_caches; - for (auto &[pid, entry] : page_entries) + DB::PageEntries page_entries; + const_cast(page_file).readAndSetPageMetas(page_entries, false); + printf("File: page_%lu_%u with %zu entries:\n", page_file.getFileId(), page_file.getLevel(), page_entries.size()); + DB::PageIdAndEntries id_and_caches; + for (auto iter = page_entries.cbegin(); iter != page_entries.cend(); ++iter) { + auto pid = iter.pageId(); + auto entry = iter.pageEntry(); id_and_caches.emplace_back(pid, entry); printPageEntry(pid, entry); } @@ -160,7 +164,7 @@ int main(int argc, char ** argv) fprintf(stderr, "Scanning over data.\n"); auto page_map = reader->read(id_and_caches); } - catch (DB::Exception &e) + catch (DB::Exception & e) { fprintf(stderr, "%s\n", e.displayText().c_str()); return 1; // Error diff --git a/dbms/src/Storages/Page/tests/utils_get_valid_pages.cpp b/dbms/src/Storages/Page/tests/utils_get_valid_pages.cpp index c483d4087fa..5dc886f82ba 100644 --- a/dbms/src/Storages/Page/tests/utils_get_valid_pages.cpp +++ b/dbms/src/Storages/Page/tests/utils_get_valid_pages.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -17,10 +18,10 @@ void Usage(const char * prog) prog); } -void printPageEntry(const DB::PageId pid, const DB::PageCache & entry) +void printPageEntry(const DB::PageId pid, const DB::PageEntry & entry) { - printf("\tpid:%9lu\t\t" - "%9lu\t%u\t%u\t%9lu\t%9lu\t%016lx\n", + printf("\tpid:%9lld\t\t" + "%9llu\t%9u\t%9u\t%9llu\t%9llu\t%016llx\n", pid, // 
entry.file_id, entry.level, @@ -45,8 +46,8 @@ int main(int argc, char ** argv) Poco::AutoPtr formatter(new Poco::PatternFormatter); formatter->setProperty("pattern", "%L%Y-%m-%d %H:%M:%S.%i <%p> %s: %t"); Poco::AutoPtr formatting_channel(new Poco::FormattingChannel(formatter, channel)); - Logger::root().setChannel(formatting_channel); - Logger::root().setLevel("trace"); + Poco::Logger::root().setChannel(formatting_channel); + Poco::Logger::root().setLevel("trace"); DB::String path = argv[1]; const int32_t MODE_DUMP_ALL_ENTRIES = 1; @@ -58,23 +59,36 @@ int main(int argc, char ** argv) Usage(argv[0]); return 1; } - auto page_files = DB::PageStorage::listAllPageFiles(path, true, &Logger::get("root")); + auto page_files = DB::PageStorage::listAllPageFiles(path, true, &Poco::Logger::get("root")); - DB::PageCacheMap valid_page_entries; + //DB::PageEntriesVersionSet versions; + DB::PageEntriesVersionSetWithDelta versions; for (auto & page_file : page_files) { - DB::PageCacheMap page_entries; - const_cast(page_file).readAndSetPageMetas(page_entries); - printf("File: page_%lu_%u with %zu entries:\n", page_file.getFileId(), page_file.getLevel(), page_entries.size()); - DB::PageIdAndCaches id_and_caches; - for (auto & [pid, entry] : page_entries) + DB::PageEntriesEdit edit; + DB::PageIdAndEntries id_and_caches; + const_cast(page_file).readAndSetPageMetas(edit); + + printf("File: page_%llu_%u with %zu entries:\n", page_file.getFileId(), page_file.getLevel(), edit.size()); + for (const auto & record : edit.getRecords()) { - id_and_caches.emplace_back(pid, entry); if (mode == MODE_DUMP_ALL_ENTRIES) { - printPageEntry(pid, entry); + switch (record.type) + { + case DB::WriteBatch::WriteType::PUT: + printf("PUT"); + printPageEntry(record.page_id, record.entry); + id_and_caches.emplace_back(std::make_pair(record.page_id, record.entry)); + break; + case DB::WriteBatch::WriteType::DEL: + printf("DEL\t%lld\n", record.page_id); + break; + case DB::WriteBatch::WriteType::REF: + 
printf("REF\t%lld\t%lld\n", record.page_id, record.ori_page_id); + break; + } } - valid_page_entries[pid] = entry; } // Read correspond page and check checksum auto reader = const_cast(page_file).createReader(); @@ -87,17 +101,28 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", e.displayText().c_str()); } + + versions.apply(edit); } if (mode == MODE_DUMP_VALID_ENTRIES) { - printf("Valid page entries: %zu\n", valid_page_entries.size()); - for (auto & [pid, entry] : valid_page_entries) + auto snapshot = versions.getSnapshot(); + auto page_ids = snapshot->version()->validPageIds(); + for (auto page_id : page_ids) + { + const DB::PageEntry * entry = snapshot->version()->find(page_id); + printPageEntry(page_id, *entry); + } +#if 0 + //printf("Valid page entries: %zu\n", valid_page_entries->size()); + for (auto iter = snapshot->version()->cbegin(); iter != snapshot->version()->cend(); ++iter) { + const DB::PageId pid = iter.pageId(); + const DB::PageEntry & entry = iter.pageEntry(); printPageEntry(pid, entry); } +#endif } - return 0; } - diff --git a/dbms/src/Storages/Transaction/RegionPersister.cpp b/dbms/src/Storages/Transaction/RegionPersister.cpp index e52fc86e9f1..2622b11d203 100644 --- a/dbms/src/Storages/Transaction/RegionPersister.cpp +++ b/dbms/src/Storages/Transaction/RegionPersister.cpp @@ -60,10 +60,10 @@ void RegionPersister::doPersist(RegionCacheWriteElement & region_write_buffer, c std::lock_guard lock(mutex); - auto cache = page_storage.getCache(region_id); - if (cache.isValid() && cache.tag > applied_index) + auto entry = page_storage.getEntry(region_id); + if (entry.isValid() && entry.tag > applied_index) { - LOG_DEBUG(log, "[region " << region_id << ", applied index " << applied_index << "] have already persisted index " << cache.tag); + LOG_DEBUG(log, "[region " << region_id << ", applied index " << applied_index << "] have already persisted index " << entry.tag); return; }