From 3129b694d0b6cccf87ae2cf5dd87b78ce4f7d862 Mon Sep 17 00:00:00 2001 From: Jung-Sang Ahn Date: Sat, 28 Dec 2019 09:41:06 -0800 Subject: [PATCH] Initial commit --- .gitmodules | 3 + CMakeLists.txt | 215 +++ LICENSE | 201 +++ README.md | 203 +++ cmake/CodeCoverage.cmake | 239 +++ cmake/FindSnappy.cmake | 56 + docs/basic_operations.md | 308 ++++ docs/how_to_use.md | 15 + docs/overview.md | 54 + docs/seq_numbers.md | 24 + examples/CMakeLists.txt | 35 + examples/README.md | 14 + examples/example_get_set_del.cc | 64 + examples/example_iterator.cc | 93 ++ examples/example_iterator_adv.cc | 142 ++ examples/example_snapshot_checkpoint.cc | 108 ++ examples/example_snapshot_instant.cc | 76 + include/libjungle/db_config.h | 446 ++++++ include/libjungle/db_stats.h | 55 + include/libjungle/iterator.h | 168 +++ include/libjungle/jungle.h | 719 +++++++++ include/libjungle/keyvalue.h | 231 +++ include/libjungle/record.h | 217 +++ include/libjungle/sized_buf.h | 497 +++++++ include/libjungle/status.h | 145 ++ manifest.sh | 1 + prepare.sh | 32 + scripts/lcov_cobertura.py | 414 ++++++ scripts/runtests.sh | 34 + src/ashared_ptr.h | 154 ++ src/avltree.cc | 671 +++++++++ src/avltree.h | 90 ++ src/backtrace.h | 393 +++++ src/bloomfilter.cc | 103 ++ src/bloomfilter.h | 59 + src/cmd_handler.cc | 174 +++ src/cmd_handler.h | 50 + src/compactor.cc | 224 +++ src/compactor.h | 57 + src/configs.h | 21 + src/crc32.cc | 386 +++++ src/crc32.h | 39 + src/db_config.cc | 72 + src/db_group.cc | 166 +++ src/db_internal.h | 289 ++++ src/db_manifest.cc | 239 +++ src/db_manifest.h | 62 + src/db_mgr.cc | 452 ++++++ src/db_mgr.h | 191 +++ src/endian_encode.h | 118 ++ src/event_awaiter.h | 97 ++ src/fileops_base.h | 134 ++ src/fileops_directio.cc | 613 ++++++++ src/fileops_directio.h | 81 ++ src/fileops_posix.cc | 221 +++ src/fileops_posix.h | 64 + src/flusher.cc | 262 ++++ src/flusher.h | 88 ++ src/generic_bitmap.h | 203 +++ src/hex_dump.h | 251 ++++ src/histogram.h | 324 +++++ src/internal_helper.cc | 492 
+++++++ src/internal_helper.h | 684 +++++++++ src/iterator.cc | 598 ++++++++ src/jungle.cc | 818 +++++++++++ src/latency_collector.h | 485 ++++++ src/latency_dump.h | 339 +++++ src/list.cc | 156 ++ src/list.h | 115 ++ src/log_file.cc | 806 ++++++++++ src/log_file.h | 236 +++ src/log_iterator.cc | 601 ++++++++ src/log_manifest.cc | 746 ++++++++++ src/log_manifest.h | 287 ++++ src/log_mgr.cc | 1404 ++++++++++++++++++ src/log_mgr.h | 340 +++++ src/log_reclaimer.cc | 78 + src/log_reclaimer.h | 41 + src/logger.cc | 1239 ++++++++++++++++ src/logger.h | 485 ++++++ src/memtable.cc | 1250 ++++++++++++++++ src/memtable.h | 236 +++ src/memtable_iterator.cc | 450 ++++++ src/murmurhash3.cc | 358 +++++ src/murmurhash3.h | 58 + src/simple_thread_pool.h | 634 ++++++++ src/skiplist.cc | 999 +++++++++++++ src/skiplist.h | 144 ++ src/table_append.cc | 48 + src/table_compact_condition.cc | 605 ++++++++ src/table_compaction.cc | 754 ++++++++++ src/table_file.cc | 1263 ++++++++++++++++ src/table_file.h | 366 +++++ src/table_file_compaction.cc | 433 ++++++ src/table_file_iterator.cc | 328 +++++ src/table_helper.cc | 211 +++ src/table_helper.h | 78 + src/table_iterator.cc | 499 +++++++ src/table_lookup_booster.cc | 76 + src/table_lookup_booster.h | 96 ++ src/table_manifest.cc | 658 +++++++++ src/table_manifest.h | 242 +++ src/table_mgr.cc | 739 ++++++++++ src/table_mgr.h | 477 ++++++ src/table_set_batch.cc | 247 ++++ src/table_split.cc | 439 ++++++ src/table_writer.cc | 179 +++ src/table_writer.h | 225 +++ src/worker_mgr.cc | 160 ++ src/worker_mgr.h | 101 ++ tests/CMakeLists.txt | 200 +++ tests/bench/adapter_selector.h | 33 + tests/bench/bench.cc | 947 ++++++++++++ tests/bench/bench_config.h | 164 +++ tests/bench/bench_worker.h | 119 ++ tests/bench/db_adapter.h | 117 ++ tests/bench/db_adapter_jungle.cc | 219 +++ tests/bench/db_adapter_jungle.h | 67 + tests/bench/dist_def.h | 250 ++++ tests/bench/dist_def_test.cc | 367 +++++ tests/bench/example_config.json | 55 + tests/bench/json.hpp | 676 
+++++++++ tests/bench/json_common.h | 77 + tests/bench/json_to_dist_def.h | 52 + tests/config_test_common.h | 26 + tests/jungle/basic_op_test.cc | 1780 +++++++++++++++++++++++ tests/jungle/casual_test.cc | 729 ++++++++++ tests/jungle/compaction_test.cc | 964 ++++++++++++ tests/jungle/corruption_test.cc | 924 ++++++++++++ tests/jungle/custom_cmp_test.cc | 410 ++++++ tests/jungle/jungle_test_common.h | 360 +++++ tests/jungle/key_itr_test.cc | 905 ++++++++++++ tests/jungle/large_test.cc | 98 ++ tests/jungle/level_extension_test.cc | 1077 ++++++++++++++ tests/jungle/log_reclaim_test.cc | 966 ++++++++++++ tests/jungle/mt_test.cc | 188 +++ tests/jungle/seq_itr_test.cc | 552 +++++++ tests/jungle/snapshot_test.cc | 1019 +++++++++++++ tests/robust/basic_robust_child.cc | 117 ++ tests/robust/basic_robust_main.cc | 16 + tests/stress/compactor_stress_test.cc | 395 +++++ tests/stress/flush_stress_test.cc | 229 +++ tests/stress/iterator_stress_test.cc | 161 ++ tests/stress/log_reclaim_stress_test.cc | 191 +++ tests/stress/many_log_files_test.cc | 78 + tests/stress/purge_stress_test.cc | 217 +++ tests/test_common.h | 1561 ++++++++++++++++++++ tests/unit/crc32_test.cc | 60 + tests/unit/fileops_directio_test.cc | 599 ++++++++ tests/unit/fileops_test.cc | 333 +++++ tests/unit/keyvalue_test.cc | 364 +++++ tests/unit/memtable_test.cc | 182 +++ tests/unit/table_lookup_booster_test.cc | 141 ++ tools/CMakeLists.txt | 18 + tools/bloomfilter_generator.cc | 196 +++ tools/jungle_checker.cc | 406 ++++++ 156 files changed, 52065 insertions(+) create mode 100644 .gitmodules create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 README.md create mode 100644 cmake/CodeCoverage.cmake create mode 100644 cmake/FindSnappy.cmake create mode 100644 docs/basic_operations.md create mode 100644 docs/how_to_use.md create mode 100644 docs/overview.md create mode 100644 docs/seq_numbers.md create mode 100644 examples/CMakeLists.txt create mode 100644 examples/README.md create mode 
100644 examples/example_get_set_del.cc create mode 100644 examples/example_iterator.cc create mode 100644 examples/example_iterator_adv.cc create mode 100644 examples/example_snapshot_checkpoint.cc create mode 100644 examples/example_snapshot_instant.cc create mode 100644 include/libjungle/db_config.h create mode 100644 include/libjungle/db_stats.h create mode 100644 include/libjungle/iterator.h create mode 100644 include/libjungle/jungle.h create mode 100644 include/libjungle/keyvalue.h create mode 100644 include/libjungle/record.h create mode 100644 include/libjungle/sized_buf.h create mode 100644 include/libjungle/status.h create mode 100644 manifest.sh create mode 100644 prepare.sh create mode 100644 scripts/lcov_cobertura.py create mode 100644 scripts/runtests.sh create mode 100644 src/ashared_ptr.h create mode 100644 src/avltree.cc create mode 100644 src/avltree.h create mode 100644 src/backtrace.h create mode 100644 src/bloomfilter.cc create mode 100644 src/bloomfilter.h create mode 100644 src/cmd_handler.cc create mode 100644 src/cmd_handler.h create mode 100644 src/compactor.cc create mode 100644 src/compactor.h create mode 100644 src/configs.h create mode 100644 src/crc32.cc create mode 100644 src/crc32.h create mode 100644 src/db_config.cc create mode 100644 src/db_group.cc create mode 100644 src/db_internal.h create mode 100644 src/db_manifest.cc create mode 100644 src/db_manifest.h create mode 100644 src/db_mgr.cc create mode 100644 src/db_mgr.h create mode 100644 src/endian_encode.h create mode 100644 src/event_awaiter.h create mode 100644 src/fileops_base.h create mode 100644 src/fileops_directio.cc create mode 100644 src/fileops_directio.h create mode 100644 src/fileops_posix.cc create mode 100644 src/fileops_posix.h create mode 100644 src/flusher.cc create mode 100644 src/flusher.h create mode 100644 src/generic_bitmap.h create mode 100644 src/hex_dump.h create mode 100644 src/histogram.h create mode 100644 src/internal_helper.cc create mode 100644 
src/internal_helper.h create mode 100644 src/iterator.cc create mode 100644 src/jungle.cc create mode 100644 src/latency_collector.h create mode 100644 src/latency_dump.h create mode 100644 src/list.cc create mode 100644 src/list.h create mode 100644 src/log_file.cc create mode 100644 src/log_file.h create mode 100644 src/log_iterator.cc create mode 100644 src/log_manifest.cc create mode 100644 src/log_manifest.h create mode 100644 src/log_mgr.cc create mode 100644 src/log_mgr.h create mode 100644 src/log_reclaimer.cc create mode 100644 src/log_reclaimer.h create mode 100644 src/logger.cc create mode 100644 src/logger.h create mode 100644 src/memtable.cc create mode 100644 src/memtable.h create mode 100644 src/memtable_iterator.cc create mode 100644 src/murmurhash3.cc create mode 100644 src/murmurhash3.h create mode 100644 src/simple_thread_pool.h create mode 100644 src/skiplist.cc create mode 100644 src/skiplist.h create mode 100644 src/table_append.cc create mode 100644 src/table_compact_condition.cc create mode 100644 src/table_compaction.cc create mode 100644 src/table_file.cc create mode 100644 src/table_file.h create mode 100644 src/table_file_compaction.cc create mode 100644 src/table_file_iterator.cc create mode 100644 src/table_helper.cc create mode 100644 src/table_helper.h create mode 100644 src/table_iterator.cc create mode 100644 src/table_lookup_booster.cc create mode 100644 src/table_lookup_booster.h create mode 100644 src/table_manifest.cc create mode 100644 src/table_manifest.h create mode 100644 src/table_mgr.cc create mode 100644 src/table_mgr.h create mode 100644 src/table_set_batch.cc create mode 100644 src/table_split.cc create mode 100644 src/table_writer.cc create mode 100644 src/table_writer.h create mode 100644 src/worker_mgr.cc create mode 100644 src/worker_mgr.h create mode 100644 tests/CMakeLists.txt create mode 100644 tests/bench/adapter_selector.h create mode 100644 tests/bench/bench.cc create mode 100644 tests/bench/bench_config.h 
create mode 100644 tests/bench/bench_worker.h create mode 100644 tests/bench/db_adapter.h create mode 100644 tests/bench/db_adapter_jungle.cc create mode 100644 tests/bench/db_adapter_jungle.h create mode 100644 tests/bench/dist_def.h create mode 100644 tests/bench/dist_def_test.cc create mode 100644 tests/bench/example_config.json create mode 100644 tests/bench/json.hpp create mode 100644 tests/bench/json_common.h create mode 100644 tests/bench/json_to_dist_def.h create mode 100644 tests/config_test_common.h create mode 100644 tests/jungle/basic_op_test.cc create mode 100644 tests/jungle/casual_test.cc create mode 100644 tests/jungle/compaction_test.cc create mode 100644 tests/jungle/corruption_test.cc create mode 100644 tests/jungle/custom_cmp_test.cc create mode 100644 tests/jungle/jungle_test_common.h create mode 100644 tests/jungle/key_itr_test.cc create mode 100644 tests/jungle/large_test.cc create mode 100644 tests/jungle/level_extension_test.cc create mode 100644 tests/jungle/log_reclaim_test.cc create mode 100644 tests/jungle/mt_test.cc create mode 100644 tests/jungle/seq_itr_test.cc create mode 100644 tests/jungle/snapshot_test.cc create mode 100644 tests/robust/basic_robust_child.cc create mode 100644 tests/robust/basic_robust_main.cc create mode 100644 tests/stress/compactor_stress_test.cc create mode 100644 tests/stress/flush_stress_test.cc create mode 100644 tests/stress/iterator_stress_test.cc create mode 100644 tests/stress/log_reclaim_stress_test.cc create mode 100644 tests/stress/many_log_files_test.cc create mode 100644 tests/stress/purge_stress_test.cc create mode 100644 tests/test_common.h create mode 100644 tests/unit/crc32_test.cc create mode 100644 tests/unit/fileops_directio_test.cc create mode 100644 tests/unit/fileops_test.cc create mode 100644 tests/unit/keyvalue_test.cc create mode 100644 tests/unit/memtable_test.cc create mode 100644 tests/unit/table_lookup_booster_test.cc create mode 100644 tools/CMakeLists.txt create mode 100644 
tools/bloomfilter_generator.cc create mode 100644 tools/jungle_checker.cc diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..8cf4adf --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/forestdb"] + path = third_party/forestdb + url = https://github.com/ForestDB-KVStore/Simple-ForestDB.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..4c6988e --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,215 @@ +cmake_minimum_required(VERSION 3.5) +project(Jungle VERSION 1.0.0 LANGUAGES CXX) + +# === Build type (default: RelWithDebInfo, O2) =========== +if (NOT CMAKE_BUILD_TYPE) + set(BUILD_TYPE_OPTIONS + "Choose the type of build, " + "options are: Debug Release RelWithDebInfo MinSizeRel.") + set(CMAKE_BUILD_TYPE "RelWithDebInfo" + CACHE ${BUILD_TYPE_OPTIONS} FORCE) + message(STATUS "Build type is not given, use default.") +endif () +message(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) + + +set(ROOT_SRC ${PROJECT_SOURCE_DIR}/src) +set(CMAKE_DIR ${PROJECT_SOURCE_DIR}/cmake) +set(FDB_LIB_DIR ${PROJECT_SOURCE_DIR}/third_party/forestdb/build) + +if (CODE_COVERAGE GREATER 0) + set(CMAKE_BUILD_TYPE "Debug") + include(cmake/CodeCoverage.cmake) + message(STATUS "---- CODE COVERAGE DETECTION MODE ----") +endif() + +# Libraries +#set(LIBZ z) + +if (SNAPPY_OPTION STREQUAL "Enable") + include(cmake/FindSnappy.cmake) + + if (SNAPPY_FOUND) + set(LIBSNAPPY ${SNAPPY_LIBRARIES}) + else() + MESSAGE(FATAL_ERROR "Can't find snappy, if you want to build without snappy set" + " \"-DSNAPPY_OPTION=Disable\"") + endif(SNAPPY_FOUND) +endif() + +set(LIBDL dl) + +# Includes +include_directories(BEFORE ./) +include_directories(BEFORE ${PROJECT_SOURCE_DIR}/src) +include_directories(BEFORE ${PROJECT_SOURCE_DIR}/include) +include_directories(BEFORE ${PROJECT_SOURCE_DIR}/tools) +include_directories(BEFORE ${PROJECT_SOURCE_DIR}/tests) + +# Compiler flags +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") +set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -Wno-pessimizing-move") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +if (NOT APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") +endif () + +if (CODE_COVERAGE GREATER 0) + APPEND_COVERAGE_COMPILER_FLAGS() + #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-inline") + #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_NO_EXCEPTION") + set(COVERAGE_EXCLUDES + 'third_party/*' + 'usr/*' + 'tests/*' + 'tools/*' + ) +endif() + + +# === SANITIZER === + +if (ADDRESS_SANITIZER GREATER 0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fuse-ld=gold") + message(STATUS "---- ADDRESS SANITIZER IS ON ----") +endif() + +if (THREAD_SANITIZER GREATER 0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread") + message(STATUS "---- THREAD SANITIZER IS ON ----") +endif() + +if (LEAK_SANITIZER GREATER 0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=leak") + message(STATUS "---- LEAK SANITIZER IS ON ----") +endif() + + + +# === Program flags === + +if (TESTSUITE_NO_COLOR GREATER 0) + add_definitions(-DTESTSUITE_NO_COLOR=1) +endif() + +file(COPY ${CMAKE_SOURCE_DIR}/scripts/runtests.sh + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + +# === CUSTOM LOGGER === + +if (LOGGER_PATH) + set(LOGGER_CC_FILE "${LOGGER_PATH}/logger.cc") + set(LOGGER_HEADER_FILE "${LOGGER_PATH}/logger.h") +else () + set(LOGGER_CC_FILE "${ROOT_SRC}/logger.cc") + set(LOGGER_HEADER_FILE "${ROOT_SRC}/logger.h") +endif () +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLOGGER_H=${LOGGER_HEADER_FILE}") +message(STATUS "Simple logger cc file: ${LOGGER_CC_FILE}") +message(STATUS "Simple logger header file: ${LOGGER_HEADER_FILE}") + + +# === Source files =================== +set(LOGGER_SRC ${LOGGER_CC_FILE}) + +if (DETACH_LOGGER GREATER 0) + message(STATUS "---- DETACH LOGGER ----") + set(LIBSIMPLELOGGER "${CMAKE_CURRENT_BINARY_DIR}/libsimplelogger.a") + + 
add_library(simplelogger_lib ${LOGGER_SRC}) + set_target_properties(simplelogger_lib PROPERTIES OUTPUT_NAME simplelogger + CLEAN_DIRECT_OUTPUT 1) + +else () + set(LOGGER_SRC_TO_CORE ${LOGGER_SRC}) + +endif () + +set(JUNGLE_CORE + ${ROOT_SRC}/avltree.cc + ${ROOT_SRC}/bloomfilter.cc + ${ROOT_SRC}/cmd_handler.cc + ${ROOT_SRC}/compactor.cc + ${ROOT_SRC}/crc32.cc + ${ROOT_SRC}/db_config.cc + ${ROOT_SRC}/db_group.cc + ${ROOT_SRC}/db_manifest.cc + ${ROOT_SRC}/db_mgr.cc + ${ROOT_SRC}/fileops_posix.cc + ${ROOT_SRC}/fileops_directio.cc + ${ROOT_SRC}/flusher.cc + ${ROOT_SRC}/internal_helper.cc + ${ROOT_SRC}/iterator.cc + ${ROOT_SRC}/jungle.cc + ${ROOT_SRC}/list.cc + ${ROOT_SRC}/log_file.cc + ${ROOT_SRC}/log_iterator.cc + ${ROOT_SRC}/log_manifest.cc + ${ROOT_SRC}/log_mgr.cc + ${ROOT_SRC}/log_reclaimer.cc + ${ROOT_SRC}/memtable.cc + ${ROOT_SRC}/memtable_iterator.cc + ${ROOT_SRC}/murmurhash3.cc + ${ROOT_SRC}/skiplist.cc + ${ROOT_SRC}/table_append.cc + ${ROOT_SRC}/table_compact_condition.cc + ${ROOT_SRC}/table_compaction.cc + ${ROOT_SRC}/table_file.cc + ${ROOT_SRC}/table_file_compaction.cc + ${ROOT_SRC}/table_file_iterator.cc + ${ROOT_SRC}/table_helper.cc + ${ROOT_SRC}/table_iterator.cc + ${ROOT_SRC}/table_lookup_booster.cc + ${ROOT_SRC}/table_manifest.cc + ${ROOT_SRC}/table_mgr.cc + ${ROOT_SRC}/table_set_batch.cc + ${ROOT_SRC}/table_split.cc + ${ROOT_SRC}/table_writer.cc + ${ROOT_SRC}/worker_mgr.cc + ${LOGGER_SRC_TO_CORE} + ${OPEN_MEMSTREAM} +) +#add_library(JUNGLE_CORE_OBJ OBJECT ${JUNGLE_CORE}) + +# Note: static libraries MUST be located in front of all shared libraries. 
+set(JUNGLE_DEPS + ${FDB_LIB_DIR}/libforestdb.a + ${LIBSNAPPY} + ${LIBDL}) + +add_library(static_lib ${JUNGLE_CORE}) +target_link_libraries(static_lib ${JUNGLE_DEPS}) +set_target_properties(static_lib PROPERTIES OUTPUT_NAME jungle + CLEAN_DIRECT_OUTPUT 1) +if (DETACH_LOGGER GREATER 0) + add_dependencies(static_lib simplelogger_lib) +endif () + +add_subdirectory("${PROJECT_SOURCE_DIR}/examples") +add_subdirectory("${PROJECT_SOURCE_DIR}/tests") +add_subdirectory("${PROJECT_SOURCE_DIR}/tools") + +if (CODE_COVERAGE GREATER 0) + SETUP_TARGET_FOR_COVERAGE( + NAME jungle_cov + EXECUTABLE ./runtests.sh + DEPENDENCIES keyvalue_test + crc32_test + fileops_test + fileops_directio_test + memtable_test + basic_op_test + seq_itr_test + key_itr_test + snapshot_test + custom_cmp_test + corruption_test + compaction_test + mt_test + log_reclaim_test + level_extension_test + table_lookup_booster_test + ) +endif() diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6dfd014 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2016 andy.yx.chen@outlook.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..30c52bc --- /dev/null +++ b/README.md @@ -0,0 +1,203 @@ + + +Jungle +====== +Embedded key-value storage library, based on a combined index of [LSM-tree](https://en.wikipedia.org/wiki/Log-structured_merge-tree) and [copy-on-write B+tree](https://www.usenix.org/legacy/events/lsf07/tech/rodeh.pdf). Please refer to our [paper](https://www.usenix.org/conference/hotstorage19/presentation/ahn). + +Jungle is specialized for building [replicated state machine](https://en.wikipedia.org/wiki/State_machine_replication) of consensus protocols such as [Paxos](https://en.wikipedia.org/wiki/Paxos_(computer_science)) or [Raft](https://raft.github.io/), by providing chronological ordering and lightweight persistent snapshot. + + +Features +-------- +* Ordered mapping of key and its value on disk (file system). Both key and value are arbitrary length binary. +* Monotonically increasing sequence number for each key-value modification. +* Point lookup on both key and sequence number. +* Range lookup on both key and sequence number, by using iterator: + * Snapshot isolation: each individual iterator is a snapshot. + * Bi-directional traversal and jump: `prev`, `next`, `gotoBegin`, `gotoEnd`, and `seek`. +* Lightweight persistent snapshot, based on sequence number: + * Nearly no overhead for the creation of a snapshot. 
+ * Snapshots are durable; preserved even after process restart. +* Tunable configurations: + * The number of threads for log flushing and compaction. + * Custom size ratio between LSM levels. + * Compaction factor (please refer to the paper). +* Log store mode: + * Ordered mapping of sequence number and value, eliminating key indexing. + * Lightweight log truncation based on sequence number. + +### Things we DO NOT (and also WILL NOT) support +* Secondary indexing, or SQL-like query: + * Jungle will not understand the contents of value. Value is just a binary from Jungle's point of view. +* Server-client style service, or all other network-involving tasks such as replication: + * Jungle is a library that should be embedded into your process. + + +Benefits +-------- +Compared to other widely used LSM-based key-value storage libraries, benefits of Jungle are as follows: + +* Smaller write amplification. + * Jungle will have 4-5 times less write amplification, while providing the similar level of write performance. +* Chronological ordering of key-value pairs + * Along with persistent logical snapshot, this feature is very useful when you use it as a replicated state machine for Paxos or Raft. + + + +How to Build +------------ +#### 1. Install `cmake`: #### +* Ubuntu +```sh +$ sudo apt-get install cmake +``` + +* OSX +```sh +$ brew install cmake +``` + +#### 2. Build #### +```sh +jungle$ ./prepare.sh -j8 +jungle$ mkdir build +jungle$ cd build +jungle/build$ cmake ../ +jungle/build$ make +``` + +Run unit tests: +``` +jungle/build$ ./runtests.sh +``` + + +How to Use +---------- +Please refer to [this document](./docs/how_to_use.md). + + +Example Implementation +----------------------- +Please refer to [examples](./examples). 
+
+
+Supported Platforms
+-------------------
+* Ubuntu (tested on 14.04, 16.04, and 18.04)
+* Centos (tested on 7)
+* OSX (tested on 10.13 and 10.14)
+
+#### Platforms will be supported in the future
+* Windows
+
+
+Contributing to This Project
+----------------------------
+We welcome contributions. If you find any bugs, potential flaws and edge cases, improvements, new feature suggestions or discussions, please submit issues or pull requests.
+
+
+Contact
+-------
+* Jung-Sang Ahn
+
+
+Coding Convention
+-----------------
+* Recommended not to exceed 90 characters per line.
+* Indent: 4 spaces, K&R (1TBS).
+* Class & struct name: `UpperCamelCase`.
+* Member function and member variable name: `lowerCamelCase`.
+* Local variable, helper function, and parameter name: `snake_case`.
+
+```C++
+class MyClass {
+public:
+    void myFunction(int my_parameter) {
+        int local_var = my_parameter + 1;
+        if (local_var < myVariable) {
+            // ...
+        } else {
+            // ...
+        }
+    }
+private:
+    int myVariable;
+};
+
+int helper_function() {
+    return 0;
+}
+```
+
+* Header include order: local to global.
+    1. Header file corresponding to this source file (if applicable).
+    2. Header files in the same project (i.e., Jungle).
+    3. Header files from the other projects.
+    4. C++ system header files.
+    5. C system header files.
+    * Note: alphabetical order within the same category.
+    * Example (`my_file.cc`):
+```C++
+#include "my_file.h" // Corresponding header file.
+
+#include "table_file.h" // Header files in the same project.
+#include "table_helper.h"
+
+#include "forestdb.h" // Header files from the other projects.
+
+#include <iostream> // C++ header files.
+#include <string>
+#include <vector>
+
+#include <stdio.h> // C header files.
+#include <stdlib.h>
+#include <string.h>
+```
+
+License Information
+--------------------
+Copyright 2017-2019 eBay Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at + +https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + +3rd Party Code +-------------- +1. URL: https://github.com/couchbase/forestdb
+License: https://github.com/couchbase/forestdb/blob/master/LICENSE
+Originally licensed under the Apache 2.0 license. + +2. URL: https://github.com/stbrumme/crc32
+Original Copyright 2011-2016 Stephan Brumme
+See Original ZLib License: https://github.com/stbrumme/crc32/blob/master/LICENSE + +3. URL: https://github.com/greensky00/simple_logger
+License: https://github.com/greensky00/simple_logger/blob/master/LICENSE
+Originally licensed under the MIT license. + +4. URL: https://github.com/greensky00/testsuite
+License: https://github.com/greensky00/testsuite/blob/master/LICENSE
+Originally licensed under the MIT license. + +5. URL: https://github.com/greensky00/latency-collector
+License: https://github.com/greensky00/latency-collector/blob/master/LICENSE
+Originally licensed under the MIT license. + +6. URL: https://github.com/eriwen/lcov-to-cobertura-xml/blob/master/lcov_cobertura/lcov_cobertura.py
+License: https://github.com/eriwen/lcov-to-cobertura-xml/blob/master/LICENSE
+Copyright 2011-2012 Eric Wendelin
+Originally licensed under the Apache 2.0 license. + +7. URL: https://github.com/bilke/cmake-modules
+License: https://github.com/bilke/cmake-modules/blob/master/LICENSE_1_0.txt
+Copyright 2012-2017 Lars Bilke
+Originally licensed under the BSD license. + +8. URL: https://github.com/aappleby/smhasher/tree/master/src
+Copyright 2016 Austin Appleby
+Originally licensed under the MIT license. diff --git a/cmake/CodeCoverage.cmake b/cmake/CodeCoverage.cmake new file mode 100644 index 0000000..ecd1010 --- /dev/null +++ b/cmake/CodeCoverage.cmake @@ -0,0 +1,239 @@ +# Copyright (c) 2012 - 2017, Lars Bilke +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CHANGES: +# +# 2012-01-31, Lars Bilke +# - Enable Code Coverage +# +# 2013-09-17, Joakim Söderberg +# - Added support for Clang. +# - Some additional usage instructions. 
+# +# 2016-02-03, Lars Bilke +# - Refactored functions to use named parameters +# +# 2017-06-02, Lars Bilke +# - Merged with modified version from github.com/ufz/ogs +# +# +# USAGE: +# +# 1. Copy this file into your cmake modules path. +# +# 2. Add the following line to your CMakeLists.txt: +# include(CodeCoverage) +# +# 3. Append necessary compiler flags: +# APPEND_COVERAGE_COMPILER_FLAGS() +# +# 4. If you need to exclude additional directories from the report, specify them +# using the COVERAGE_EXCLUDES variable before calling SETUP_TARGET_FOR_COVERAGE. +# Example: +# set(COVERAGE_EXCLUDES 'dir1/*' 'dir2/*') +# +# 5. Use the functions described below to create a custom make target which +# runs your test executable and produces a code coverage report. +# +# 6. Build a Debug build: +# cmake -DCMAKE_BUILD_TYPE=Debug .. +# make +# make my_coverage_target +# + +include(CMakeParseArguments) + +# Check prereqs +find_program( GCOV_PATH gcov ) +find_program( LCOV_PATH NAMES lcov lcov.bat lcov.exe lcov.perl) +find_program( GENHTML_PATH NAMES genhtml genhtml.perl genhtml.bat ) +find_program( GCOVR_PATH gcovr PATHS ${CMAKE_SOURCE_DIR}/scripts/test) +find_program( SIMPLE_PYTHON_EXECUTABLE python ) + +if(NOT GCOV_PATH) + message(FATAL_ERROR "gcov not found! Aborting...") +endif() # NOT GCOV_PATH + +if("${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?[Cc]lang") + if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 3) + message(FATAL_ERROR "Clang version must be 3.0.0 or greater! Aborting...") + endif() +elseif(NOT CMAKE_COMPILER_IS_GNUCXX) + message(FATAL_ERROR "Compiler is not GNU gcc! Aborting...") +endif() + +set(COVERAGE_COMPILER_FLAGS "-g -O0 --coverage -fprofile-arcs -ftest-coverage" #-fno-exceptions" + CACHE INTERNAL "") + +set(CMAKE_CXX_FLAGS_COVERAGE + ${COVERAGE_COMPILER_FLAGS} + CACHE STRING "Flags used by the C++ compiler during coverage builds." 
+ FORCE ) +set(CMAKE_C_FLAGS_COVERAGE + ${COVERAGE_COMPILER_FLAGS} + CACHE STRING "Flags used by the C compiler during coverage builds." + FORCE ) +set(CMAKE_EXE_LINKER_FLAGS_COVERAGE + "" + CACHE STRING "Flags used for linking binaries during coverage builds." + FORCE ) +set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE + "" + CACHE STRING "Flags used by the shared libraries linker during coverage builds." + FORCE ) +mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_EXE_LINKER_FLAGS_COVERAGE + CMAKE_SHARED_LINKER_FLAGS_COVERAGE ) + +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + message(WARNING "Code coverage results with an optimised (non-Debug) build may be misleading") +endif() # NOT CMAKE_BUILD_TYPE STREQUAL "Debug" + +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + link_libraries(gcov) +else() + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") +endif() + +# Defines a target for running and collection code coverage information +# Builds dependencies, runs the given executable and outputs reports. +# NOTE! The executable should always have a ZERO as exit code otherwise +# the coverage generation will not complete. +# +# SETUP_TARGET_FOR_COVERAGE( +# NAME testrunner_coverage # New target name +# EXECUTABLE testrunner -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR +# DEPENDENCIES testrunner # Dependencies to build first +# ) +function(SETUP_TARGET_FOR_COVERAGE) + + set(options NONE) + set(oneValueArgs NAME) + set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) + cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT LCOV_PATH) + message(FATAL_ERROR "lcov not found! Aborting...") + endif() # NOT LCOV_PATH + + if(NOT GENHTML_PATH) + message(FATAL_ERROR "genhtml not found! Aborting...") + endif() # NOT GENHTML_PATH + + # Setup target + add_custom_target(${Coverage_NAME} + + # Cleanup lcov + COMMAND ${LCOV_PATH} --directory . 
--zerocounters + # Create baseline to make sure untouched files show up in the report + COMMAND ${LCOV_PATH} -c -i -d . -o ${Coverage_NAME}.base + + # Run tests + COMMAND ${Coverage_EXECUTABLE} + + # Capturing lcov counters and generating report + COMMAND ${LCOV_PATH} --directory . --capture --output-file ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info + # add baseline counters + COMMAND ${LCOV_PATH} -a ${PROJECT_BINARY_DIR}/${Coverage_NAME}.base -a ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info --output-file ${PROJECT_BINARY_DIR}/${Coverage_NAME}.total + COMMAND ${LCOV_PATH} --remove ${Coverage_NAME}.total ${COVERAGE_EXCLUDES} --output-file ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + COMMAND ${GENHTML_PATH} -o ${PROJECT_BINARY_DIR}/${Coverage_NAME} ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + #COMMAND ${CMAKE_COMMAND} -E remove ${Coverage_NAME}.base ${Coverage_NAME}.info ${Coverage_NAME}.total ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + DEPENDS ${Coverage_DEPENDENCIES} + COMMENT "Resetting code coverage counters to zero.\nProcessing code coverage counters and generating report." + ) + + # Show info where to find the report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Open ./${Coverage_NAME}/index.html in your browser to view the coverage report." + ) + +endfunction() # SETUP_TARGET_FOR_COVERAGE + +# Defines a target for running and collection code coverage information +# Builds dependencies, runs the given executable and outputs reports. +# NOTE! The executable should always have a ZERO as exit code otherwise +# the coverage generation will not complete. 
+# +# SETUP_TARGET_FOR_COVERAGE_COBERTURA( +# NAME ctest_coverage # New target name +# EXECUTABLE ctest -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR +# DEPENDENCIES executable_target # Dependencies to build first +# ) +function(SETUP_TARGET_FOR_COVERAGE_COBERTURA) + + set(options NONE) + set(oneValueArgs NAME) + set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) + cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT SIMPLE_PYTHON_EXECUTABLE) + message(FATAL_ERROR "python not found! Aborting...") + endif() # NOT SIMPLE_PYTHON_EXECUTABLE + + if(NOT GCOVR_PATH) + message(FATAL_ERROR "gcovr not found! Aborting...") + endif() # NOT GCOVR_PATH + + # Combine excludes to several -e arguments + set(COBERTURA_EXCLUDES "") + foreach(EXCLUDE ${COVERAGE_EXCLUDES}) + set(COBERTURA_EXCLUDES "-e ${EXCLUDE} ${COBERTURA_EXCLUDES}") + endforeach() + message(STATUS ${COBERTURA_EXCLUDES}) + + add_custom_target(${Coverage_NAME} + + # Run tests + ${Coverage_EXECUTABLE} + + # Running gcovr + COMMAND ${GCOVR_PATH} -x -r ${CMAKE_SOURCE_DIR} ${COBERTURA_EXCLUDES} + -o ${Coverage_NAME}.xml -s + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + DEPENDS ${Coverage_DEPENDENCIES} + COMMENT "Running gcovr to produce Cobertura code coverage report." + ) + + # Show info where to find the report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Cobertura code coverage report saved in ${Coverage_NAME}.xml." 
+ ) + +endfunction() # SETUP_TARGET_FOR_COVERAGE_COBERTURA + +function(APPEND_COVERAGE_COMPILER_FLAGS) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) + message(STATUS "Appending code coverage compiler flags: ${COVERAGE_COMPILER_FLAGS}") +endfunction() # APPEND_COVERAGE_COMPILER_FLAGS diff --git a/cmake/FindSnappy.cmake b/cmake/FindSnappy.cmake new file mode 100644 index 0000000..fe6d4ff --- /dev/null +++ b/cmake/FindSnappy.cmake @@ -0,0 +1,56 @@ +# Locate snappy library +# This module defines +# SNAPPY_FOUND, if false, do not try to link with snappy +# LIBSNAPPY, Library path and libs +# SNAPPY_INCLUDE_DIR, where to find the ICU headers + +FIND_PATH(SNAPPY_INCLUDE_DIR snappy.h + HINTS + ENV SNAPPY_DIR + PATH_SUFFIXES include + PATHS + ~/Library/Frameworks + /Library/Frameworks + /usr/local + /opt/local + /opt/csw + /opt/snappy + /opt) + +FIND_LIBRARY(SNAPPY_LIBRARIES + NAMES snappy + HINTS + ENV SNAPPY_DIR + PATHS + ~/Library/Frameworks + /Library/Frameworks + /usr/local + /opt/local + /opt/csw + /opt/snappy + /opt) + +FIND_LIBRARY(SNAPPY_STATIC_LIBRARIES + NAMES libsnappy.a + HINTS + ENV SNAPPY_DIR + PATHS + ~/Library/Frameworks + /Library/Frameworks + /usr/local + /opt/local + /opt/csw + /opt/snappy + /opt) + +IF (SNAPPY_LIBRARIES AND SNAPPY_INCLUDE_DIR ) + include_directories(AFTER ${SNAPPY_INCLUDE_DIR}) + MESSAGE(STATUS "Found snappy in ${SNAPPY_INCLUDE_DIR} : ${SNAPPY_LIBRARIES}") + SET(SNAPPY_FOUND ON) + + MARK_AS_ADVANCED(SNAPPY_INCLUDE_DIR SNAPPY_LIBRARIES) +ELSE (SNAPPY_LIBRARIES AND SNAPPY_INCLUDE_DIR ) + MESSAGE(STATUS "Snappy : NOT Found") + SET(SNAPPY_FOUND OFF) + +ENDIF (SNAPPY_LIBRARIES AND SNAPPY_INCLUDE_DIR ) diff --git a/docs/basic_operations.md b/docs/basic_operations.md new file mode 100644 index 0000000..6ac8125 --- /dev/null +++ b/docs/basic_operations.md @@ -0,0 +1,308 @@ +Basic Operations +=== + +All APIs and their parameters 
are explained in [header files](../include/libjungle):
+* [db_config.h](../include/libjungle/db_config.h): DB configurations.
+* [db_stats.h](../include/libjungle/db_stats.h): DB statistics.
+* [iterator.h](../include/libjungle/iterator.h): Iterator operations.
+* [jungle.h](../include/libjungle/jungle.h): DB operations.
+* [keyvalue.h](../include/libjungle/keyvalue.h): Key-value pair.
+* [record.h](../include/libjungle/record.h): Record.
+* [sized_buf.h](../include/libjungle/sized_buf.h): Basic buffer.
+* [status.h](../include/libjungle/status.h): Operation result status.
+
+The basic unit of indexing in Jungle is a [record](../include/libjungle/record.h), which consists of [key-value pair](../include/libjungle/keyvalue.h), sequence number, operation type, and custom metadata. Key, value, and metadata are based on variable size buffer (i.e., a memory buffer and its length), called [SizedBuf](../include/libjungle/sized_buf.h) in Jungle.
+
+
+### Contents
+* [Initialization](#initialization)
+* [Open and close](#open-and-close)
+* [Set operation](#set-operation)
+* [Get operation](#get-operation)
+* [Delete operation](#delete-operation)
+* [Iterator](#iterator)
+
+-----
+### [Initialization](#contents)
+
+Jungle defines various global resources shared among different DB instances in the same process, such as cache and thread pool. You can explicitly initialize or release them.
+
+Initialize:
+```C++
+GlobalConfig global_config;
+// ... set your config ...
+jungle::init(global_config);
+```
+
+Release:
+```C++
+jungle::shutdown();
+```
+
+Note that above APIs can be skipped. In such case, initialization will be done on the first DB open with the default configurations, and they will be released at the termination of the process.
+
+
+-----
+### [Open and close](#contents)
+
+Each Jungle instance is a single directory which contains multiple files such as DB logs, tables, and some debugging logs. To open a DB instance, a path to the DB should be given. 
+```C++ +DBConfig db_config; +// ... set your config ... +DB* db = nullptr; +Status s = DB::open(&db, "./my_db", db_config); +``` +Once opening DB is successful, `s.ok()` will be `true`. Otherwise, you can see the result value by calling `s.getValue()` or `s.toString()`. + +If the given path does not exist or empty, Jungle will create a new one. + +You can close the DB instance by calling `close` API: +```C++ +Status s = db->close(); +``` + + +----- +### [Set operation](#contents) + +There are four different ways to set (upsert) a record. + +##### Set a key value pair: `jungle::set`. +* Custom metadata will be empty, and sequence number will be automatically generated. +```C++ +db->set(KV("key", "value")); +``` + +##### Set a key value pair with custom sequence number: `jungle::setSN`. + * Custom metadata will be empty, and sequence number will be set to the given number. + * Sequence number should be unique and increasing. If not, undefined behavior including system crash or data corruption will happen. +```C++ +KV key_value; +db->setSN(100, KV("key", "value")); +``` + + +##### Set a record: `jungle::setRecordByKey`. + * Custom metadata will be set to the given data, and sequence number will be automatically generated. +```C++ +Record record; +// ... set record ... +db->setRecordByKey(record); +``` + + +##### Set a record with custom sequence number: `jungle::setRecord`. + * Both custom metadata and sequence number will be set to the given values. + * Sequence number should be unique and increasing. If not, undefined behavior including system crash or data corruption will happen. +```C++ +Record record; +// ... set record ... +record.seqNum = 100; +db->setRecord(record); +``` + + +----- +### [Get operation](#contents) + +There are four different ways to get (point query) a record. User is responsible for the deallocation of memory returned by get operations. + +##### Get a value corresponding to given key: `jungle::get`. +* Only value part will be returned. 
+* `get` on deleted key will not succeed.
+```C++
+SizedBuf returned_value;
+db->get(SizedBuf("key_to_find"), returned_value);
+returned_value.free();
+```
+
+##### Get a key value pair corresponding to given sequence number: `jungle::getSN`.
+* Key and value will be returned.
+* `getSN` on deleted key will not succeed.
+* Note: *currently we support this API only for log store mode.*
+```C++
+KV returned_key_value;
+db->getSN(100, returned_key_value);
+returned_key_value.free();
+```
+
+
+##### Get a record corresponding to given key: `jungle::getRecordByKey`.
+* All fields in record will be returned.
+* There is a flag to retrieve a deleted record.
+* Below example will not succeed on deleted record:
+```C++
+Record returned_record;
+db->getRecordByKey(SizedBuf("key_to_find"), returned_record);
+returned_record.free();
+```
+
+* With setting the flag to `true`, deleted record will be returned. Custom metadata will be retrieved if it was set before when the record was deleted. Value part of the returned record will be empty.
+* Only logically deleted records will be visible. Once a record is physically deleted by merge or compaction, it will not be retrieved anymore.
+```C++
+Record returned_record;
+db->getRecordByKey(SizedBuf("key_to_find"), returned_record, true);
+returned_record.free();
+```
+
+
+##### Get a record corresponding to given sequence number: `jungle::getRecord`.
+* All fields in record will be returned.
+* `getRecord` on deleted key will always succeed.
+* Note: *currently we support this API only for log store mode.*
+```C++
+Record returned_record;
+db->getRecord(100, returned_record);
+returned_record.free();
+```
+
+-----
+
+### [Delete operation](#contents)
+
+In Jungle, delete operation is the same as update operation, modifying the existing record as a deletion marker (i.e., tombstone). You can put your custom metadata for each tombstone and retrieve it later. Tombstones are physically purged later, during Table compaction. 
+
+There are three different ways to delete a record.
+
+##### Delete a key and its value: `jungle::del`.
+* Sequence number for this tombstone will be automatically generated.
+```C++
+db->del(SizedBuf("key_to_delete"));
+```
+
+##### Delete a key and its value with custom sequence number: `jungle::delSN`.
+ * Sequence number for this tombstone will be set to the given number.
+ * Sequence number should be unique and increasing. If not, undefined behavior including system crash or data corruption will happen.
+```C++
+db->delSN(100, SizedBuf("key_to_delete"));
+```
+
+##### Delete a record with custom metadata and sequence number: `jungle::setRecord`.
+ * Both custom metadata and sequence number for this tombstone will be set to the given values.
+ * Sequence number should be unique and increasing. If not, undefined behavior including system crash or data corruption will happen.
+```C++
+Record record;
+record.meta = ... // metadata for this tombstone
+record.seqNum = 100;
+db->setRecord(record);
+```
+
+-----
+### [Iterator](#contents)
+
+Each iterator works as a snapshot, thus any mutations will not be applied to previous iterators already opened.
+
+Jungle can create an iterator in two different orders: key and sequence number.
+
+
+#### Opening an iterator from DB instance
+
+##### Key iterator
+
+```C++
+Iterator itr;
+itr.init(db);
+```
+This iterator can access all keys in the given `db`. You can also specify the range.
+```C++
+itr.init(db, SizedBuf("a"), SizedBuf("z")); // from a to z (inclusive)
+```
+or
+```C++
+itr.init(db, SizedBuf("a")); // from a to max
+```
+or
+```C++
+itr.init(db, SizedBuf(), SizedBuf("z")); // from min to z
+```
+
+##### Sequence number iterator
+
+```C++
+Iterator itr;
+itr.initSN(db);
+```
+This iterator can access all sequence numbers in the given `db`. You can also specify the range. 
+```C++
+itr.initSN(db, 100, 200); // from 100 to 200 (inclusive)
+```
+or
+```C++
+itr.initSN(db, 100); // from 100 to max
+```
+or
+```C++
+itr.initSN(db, DB::NULL_SEQNUM, 200); // from min to 200
+```
+
+##### Note
+
+Even though the DB instance is empty or there is no record within the given range, opening an iterator will succeed. But any following operations on that iterator will return an error.
+
+#### Closing an iterator
+
+Both key and sequence number iterators can be closed using `DB::Iterator::close` API.
+```C++
+itr.close();
+```
+All iterators should be closed before closing the parent DB instance.
+
+
+#### Get a record at the current cursor
+
+```C++
+Record returned_record;
+Status s = itr.get(returned_record);
+returned_record.free();
+```
+If the iterator currently does not point to any record, `s.ok()` will be `false`.
+
+Same as `get` operations, user is responsible for the deallocation of the returned record.
+
+Sequence number iterator will return tombstones, while key iterator will return live records only.
+
+#### Move the cursor of the iterator
+
+##### Forward
+```C++
+Status s = itr.next();
+```
+If the cursor is successfully moved, `s.ok()` will be `true`.
+
+##### Backward
+```C++
+Status s = itr.prev();
+```
+If the cursor is successfully moved, `s.ok()` will be `true`.
+
+##### Jump to begin
+```C++
+Status s = itr.gotoBegin();
+```
+If the cursor is successfully moved, `s.ok()` will be `true`.
+
+##### Jump to end
+```C++
+Status s = itr.gotoEnd();
+```
+If the cursor is successfully moved, `s.ok()` will be `true`.
+
+##### Jump to a random position (key iterator)
+```C++
+Status s = itr.seek(SizedBuf("key_to_find"));
+```
+If the given key does not exist, this API will find the smallest but greater than the given key. If such key does not exist either, `s.ok()` will be `false`.
+
+There is an option to choose the behavior when the exact match does not exist. 
If you explicitly set it as follows: +```C++ +Status s = itr.seek(SizedBuf("key_to_find", Iterator::SMALLER); +``` +then it will find the greatest but smaller than the given key. + +##### Jump to a random position (sequence number iterator) +```C++ +Status s = itr.seekSN(100); +``` +Other things are identical to those of key iterator. + + diff --git a/docs/how_to_use.md b/docs/how_to_use.md new file mode 100644 index 0000000..c6e88f2 --- /dev/null +++ b/docs/how_to_use.md @@ -0,0 +1,15 @@ + +How to Use This Library +======================= + +The fundamental concept has been introduced in our paper: + +* Jung-Sang Ahn, Mohiuddin Abdul Qader, Woon-Hak Kang, Hieu Nguyen, Guogen Zhang, and Sami Ben-Romdhane, [Jungle: Towards Dynamically Adjustable Key-Value Store by Combining LSM-Tree and Copy-On-Write B+-Tree](https://www.usenix.org/system/files/hotstorage19-paper-ahn.pdf), USENIX HotStorage 2019. + + +Contents +-------- +* [Overview: Combining LSM-Tree & Append-Only B+Tree](overview.md) +* [Understanding Sequence Numbers](seq_numbers.md) +* [Basic Operations](basic_operations.md) +* Tuning Parameters diff --git a/docs/overview.md b/docs/overview.md new file mode 100644 index 0000000..199f63a --- /dev/null +++ b/docs/overview.md @@ -0,0 +1,54 @@ +Overview: Combining LSM-Tree & Append-Only B+Tree +================================================= + +Similar to other popular LSM-tree implementations, Jungle adopts horizontally partitioned leveling approach; there are multiple levels whose maximum size increases exponentially, and each level consists of multiple disjoint (i.e., non-overlapping) tables. Below diagram illustrates the overall architecture. + +``` + [MemTable 0] [MemTable 1] ... memory +-------------------------------------------------------------------- +Log [Log file 0] [Log file 1] ... disk + +Level-0 [Table-hash(0)] [Table-hash(1)] ... +(hash) + +Level-1 [Table-range(0-100)] [Table-range(101-200)] ... 
+(range) + +Level-2 [Table-range(0-10)] [Table-range(11-20)] ... +(range) + +... +``` + +Once a new update comes in, it is inserted into *MemTable*, which is an in-memory ordered index based on lock-free [Skiplist](https://en.wikipedia.org/wiki/Skip_list). Each MemTable has a corresponding log file on disk, and the contents of MemTables are periodically synced to log file for durability. Records in a log file are stored in chronological order, so that we always need corresponding MemTables to serve `get` operation in `O(log N)` time. Different log files or MemTables may have duplicate key. + +Each MemTable has a limit on both size and the number of records. A new MemTable and log file pair is created if the last one becomes full. Only the latest MemTable serves the incoming traffic, while older ones become immutable. + +Since MemTables reside in memory, there is a limit for the entire size of MemTable and log file pairs, and the records in MemTables are merged into Tables in level-0 once the size exceeds the limit, or user explicitly requests the task. After merge, stale log files and MemTables are removed. + +A *Table* is an ordered index on disk, where each table is a single file. Tables in the same level are not overlapping each other, thus an arbitrary key belongs to only one table in a level. Tables in different levels can have duplicate key, but the key in upper (i.e., the smaller number) level is always newer than that in lower level. + +Each level has a size limit. Once a level exceeds the limit, Jungle picks a victim Table, and merges it to overlapping Tables in the next level, called *inter-level compaction*. If no more level exists, there are two choices: 1) if the entire DB size is big enough, add one more level. Otherwise 2) do *in-place compaction* to reduce the size. Please refer to our [paper](https://www.usenix.org/conference/hotstorage19/presentation/ahn) for more details. + + +Jungle Table vs. 
Sorted String Table (SSTable)
+----------------------------------------------
+In Jungle, each individual Table in every level is an append-only B+tree, whose key and value are separated. We are currently using ForestDB for it. Data once written in append-only B+tree are immutable, similar to sorted string tables (SSTables) in other LSM-tree implementations, thus it does not allow overwriting existing data. That means we can support multi-version concurrency control (MVCC) as other implementations do; writers do not block readers, and readers do not block writers.
+
+However, one big advantage of append-only B+tree over SSTable is that it still supports *append* operation while keeping previous data immutable. If we want to merge some records into a SSTable, we should write a brand new SSTable including the new records, and then remove the old SSTable. With append-only B+tree, all we need to do for merge is just appending new records (i.e., delta) to the existing table, while append-only B+tree will take care of the ordering of the table so that we are still able to do `O(log N)` search.
+
+A notable benefit of doing *append* instead of *rewrite* is reducing write amplification. The amount of disk writes for rewriting SSTables is huge compared to the original amount of the data to update. Append-only B+tree also requires periodic *compaction* task which rewrites the entire B+tree to reclaim disk space. However, the frequency of append-only B+tree compaction is much less than that of SSTable rewriting. In our observation, the overall write amplification is reduced by up to 4-5 times compared to other LSM-tree implementations.
+
+Other benefits:
+* We have more chances for aggressive optimizations. For instance, the level-0 in Jungle is hash-partitioned to boost parallel disk write (i.e., appending data to Tables in level-0), without increasing read cost by keeping all tables disjoint.
+In other LSM-tree implementations, level-0 is a special level where key overlapping is allowed, to make MemTable flushing faster by avoiding rewriting existing SSTables. But due to duplicate keys across different SSTables, 1) searching a key in level-0 and 2) merging data from level-0 to level-1 might be inefficient. + +* We can still use the original benefits of append-only B+tree, where every individual append works as a persistent snapshot. Having snapshots that are persistent across process restarts is difficult to achieve in existing LSM-tree implementations, due to SSTable characteristics. However, Jungle can easily support persistent snapshots as each Table is an append-only B+tree. + + +Reducing the Number of Levels +----------------------------- +Typically, the ratio of size limit between adjacent levels is a fixed number in most LSM-tree implementations, usually `10`. This number directly affects 1) the number of levels and 2) the overall write amplification. If we have bigger ratio number, it will reduce the number of levels which is good for decreasing read cost. However, it increases the number of overlapping SSTables between adjacent levels, that means write amplification also increases proportionally as we should rewrite all overlapping SSTables. + +However, in Jungle, we only append delta to Tables in the next level. Consequently, increasing the ratio number barely affects the write amplification so that we can safely reduce the number of levels, to get better read performance. + diff --git a/docs/seq_numbers.md b/docs/seq_numbers.md new file mode 100644 index 0000000..cb31bdd --- /dev/null +++ b/docs/seq_numbers.md @@ -0,0 +1,24 @@ +Understanding Sequence Numbers +============================== + +In Jungle, every single mutation (i.e., set, delete, and user-defined special record) has a unique sequence number, which starts from `1` and then monotonically increasing. 
For example, if we set three keys, their sequence numbers will be `1`, `2`, and `3`, respectively: +```C++ +db->set( KV("a", "A") ); // sequence number 1. +db->set( KV("b", "B") ); // sequence number 2. +db->set( KV("c", "C") ); // sequence number 3. +``` + +If we delete an existing key `b`, the new sequence number `4` will be assigned to the deletion marker, called *tombstone*: +```C++ +db->del( SizedBuf("b") ); // sequence number 4. +``` +This tombstone will remain until the underlying table gets compacted. After compaction, both key `"b"` and sequence number `4` will not be visible. + +If multiple mutations are issued on the same key, multiple sequence numbers will be created for it. But eventually only the last sequence number of the key will last as logs and Tables get merged: +```C++ +db->set( KV("b", "BB") ); // sequence number 5. +db->del( SizedBuf("b") ); // sequence number 6. +db->set( KV("b", "BBB") ); // sequence number 7. +``` +Sequence numbers for the key `"b"` (`2` and `4`-`7`) will exist for a while, and finally only the last sequence number `7` will remain. Note that the record referred to by the last sequence number always represents the current state of the key, which means that applying the sequence of mutations from `4` to `7` and applying only the mutation `7` will bring you the same result. 
+ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000..17386c5 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,35 @@ +set(EXAMPLES_DIR ${PROJECT_SOURCE_DIR}/examples) + +set(JUNGLE_TEST_DEPS + ${CMAKE_CURRENT_BINARY_DIR}/../libjungle.a + ${LIBSIMPLELOGGER} + ${FDB_LIB_DIR}/libforestdb.a + ${LIBSNAPPY} + ${LIBDL}) + +set(GET_SET_DEL ${EXAMPLES_DIR}/example_get_set_del.cc) +add_executable(get_set_del ${GET_SET_DEL}) +target_link_libraries(get_set_del ${JUNGLE_TEST_DEPS}) +add_dependencies(get_set_del static_lib) + +set(ITERATOR ${EXAMPLES_DIR}/example_iterator.cc) +add_executable(iterator ${ITERATOR}) +target_link_libraries(iterator ${JUNGLE_TEST_DEPS}) +add_dependencies(iterator static_lib) + +set(ITERATOR_ADV ${EXAMPLES_DIR}/example_iterator_adv.cc) +add_executable(iterator_adv ${ITERATOR_ADV}) +target_link_libraries(iterator_adv ${JUNGLE_TEST_DEPS}) +add_dependencies(iterator_adv static_lib) + +set(SNAPSHOT_CHK ${EXAMPLES_DIR}/example_snapshot_checkpoint.cc) +add_executable(snapshot_checkpoint ${SNAPSHOT_CHK}) +target_link_libraries(snapshot_checkpoint ${JUNGLE_TEST_DEPS}) +add_dependencies(snapshot_checkpoint static_lib) + +set(SNAPSHOT_INST ${EXAMPLES_DIR}/example_snapshot_instant.cc) +add_executable(snapshot_instant ${SNAPSHOT_INST}) +target_link_libraries(snapshot_instant ${JUNGLE_TEST_DEPS}) +add_dependencies(snapshot_instant static_lib) + + \ No newline at end of file diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..00f9768 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,14 @@ +Jungle Examples +--------- + +* [example_get_set_del.cc](./example_get_set_del.cc) + - Basic `get`, `set`, and `del` operations. +* [example_iterator.cc](./example_iterator.cc) + - Basic key iterator and sequence number iterator operations. +* [example_iterator_adv.cc](./example_iterator_adv.cc) + - Advanced iterator operations: bi-directional traversal and seek. 
+* [example_snapshot_checkpoint.cc](./example_snapshot_checkpoint.cc) + - Persistent snapshot based on checkpoint. +* [example_snapshot_instant.cc](./example_snapshot_instant.cc) + - Instant (volatile) snapshot. + diff --git a/examples/example_get_set_del.cc b/examples/example_get_set_del.cc new file mode 100644 index 0000000..e986762 --- /dev/null +++ b/examples/example_get_set_del.cc @@ -0,0 +1,64 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include + +// For removing directory purpose, not needed in practice. +#include "test_common.h" + +using namespace jungle; + +int main() { + const static std::string path = "./db_example_get_set_del"; + + // Remove existing DB first. + TestSuite::clearTestFile(path); + + // Initialize global resources with default config. + jungle::init(GlobalConfig()); + + // Open an instance at the given path, with default config. + DB* db = nullptr; + DB::open(&db, path, DBConfig()); + + // Set {"hello", "world"} pair. + db->set( KV("hello", "world") ); + + // Search key "hello", should return "world". + SizedBuf value_out; + Status s; + s = db->get( SizedBuf("hello"), value_out ); + std::cout << "return code " << s << ", " + << value_out.toString() << std::endl; + // Should free the memory of value after use. 
+ value_out.free(); + + // Delete a pair corresponding to key "hello". + db->del( SizedBuf("hello") ); + + // Search key "hello", should return error. + s = db->get( SizedBuf("hello"), value_out ); + std::cout << "return code " << s << std::endl; + + // Close and free DB instance. + DB::close(db); + + // Release global resources. + jungle::shutdown(); + + return 0; +} + diff --git a/examples/example_iterator.cc b/examples/example_iterator.cc new file mode 100644 index 0000000..b159d5d --- /dev/null +++ b/examples/example_iterator.cc @@ -0,0 +1,93 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include + +// For removing directory purpose, not needed in practice. +#include "test_common.h" + +using namespace jungle; + +int main() { + const static std::string path = "./db_example_iterator"; + + // Remove existing DB first. + TestSuite::clearTestFile(path); + + // Initialize global resources with default config. + jungle::init(GlobalConfig()); + + // Open an instance at the given path, with default config. + DB* db = nullptr; + DB::open(&db, path, DBConfig()); + + // Set {"foo", "FOO"} + // {"bar", "BAR"} + // {"baz", "BAZ"} pairs. + db->set( KV("foo", "FOO") ); + db->set( KV("bar", "BAR") ); + db->set( KV("baz", "BAZ") ); + + // Initialize a key-order iterator. 
+ Iterator key_itr; + key_itr.init(db); + + // Initialize a seq-order iterator. + Iterator seq_itr; + seq_itr.initSN(db); + + // Add one more pair: {"qux", "QUX"}. + // Since each iterator is a snapshot, + // this pair will not be visible to both key and seq iterators. + db->set( KV("qux", "QUX") ); + + // Iterate in key order, should be in bar, baz, and foo order. + do { + Record rec_out; + Status s = key_itr.get(rec_out); + if (!s) break; + + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + rec_out.free(); + } while (key_itr.next().ok()); + + // Close the key-iterator. + key_itr.close(); + + // Iterate in seq order, should be in foo, bar, and baz order. + do { + Record rec_out; + Status s = seq_itr.get(rec_out); + if (!s) break; + + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + rec_out.free(); + } while (seq_itr.next().ok()); + + // Close the seq-iterator. + seq_itr.close(); + + // Close and free DB instance. + DB::close(db); + + // Release global resources. + jungle::shutdown(); + + return 0; +} + diff --git a/examples/example_iterator_adv.cc b/examples/example_iterator_adv.cc new file mode 100644 index 0000000..fac8b07 --- /dev/null +++ b/examples/example_iterator_adv.cc @@ -0,0 +1,142 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include + +// For removing directory purpose, not needed in practice. +#include "test_common.h" + +using namespace jungle; + +int main() { + const static std::string path = "./db_example_iterator_adv"; + + // Remove existing DB first. + TestSuite::clearTestFile(path); + + // Initialize global resources with default config. + jungle::init(GlobalConfig()); + + // Open an instance at the given path, with default config. + DB* db = nullptr; + DB::open(&db, path, DBConfig()); + + // Set {"k00", "v00"} + // {"k10", "v10"} + // ... + // {"k80", "v80"} pairs. + for (size_t ii=0; ii<9; ++ii) { + std::string key_str = "k" + std::to_string(ii) + "0"; + std::string val_str = "v" + std::to_string(ii) + "0"; + db->set( KV(key_str, val_str) ); + } + + // Initialize a key-order iterator for the range between + // k15 and k85. + Iterator key_itr; + key_itr.init(db, SizedBuf("k15"), SizedBuf("k85")); + + // Iterate all: since both k15 and k85 do not exist, + // it will traverse from k20 to k80. + std::cout << "iterate all:" << std::endl; + do { + Record rec_out; + Record::Holder h(rec_out); // Auto free. + if (!key_itr.get(rec_out).ok()) break; + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + } while (key_itr.next().ok()); + + // Seek k65: default option is GREATER, + // cursor will point to k70. + std::cout << "after seek k65:" << std::endl; + key_itr.seek( SizedBuf("k65") ); + do { + Record rec_out; + Record::Holder h(rec_out); // Auto free. + if (!key_itr.get(rec_out).ok()) break; + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + } while (key_itr.next().ok()); + + // Seek k65 with SMALLER option: + // cursor will point to k60. + std::cout << "after seek k65 with SMALLER:" << std::endl; + key_itr.seek( SizedBuf("k65"), Iterator::SMALLER ); + do { + Record rec_out; + Record::Holder h(rec_out); // Auto free. 
+ if (!key_itr.get(rec_out).ok()) break; + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + } while (key_itr.next().ok()); + + // Seek k70 with SMALLER option: + // cursor will point to k70, as k70 exists. + std::cout << "after seek k70 with SMALLER:" << std::endl; + key_itr.seek( SizedBuf("k70"), Iterator::SMALLER ); + do { + Record rec_out; + Record::Holder h(rec_out); // Auto free. + if (!key_itr.get(rec_out).ok()) break; + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + } while (key_itr.next().ok()); + + // Iterate all using prev: should print k20-k80 in reversed order. + std::cout << "iterate all using prev:" << std::endl; + do { + Record rec_out; + Record::Holder h(rec_out); // Auto free. + if (!key_itr.get(rec_out).ok()) break; + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + } while (key_itr.prev().ok()); + + // After goto end: cursor is located at the end. + std::cout << "after gotoEnd:" << std::endl; + key_itr.gotoEnd(); + do { + Record rec_out; + Record::Holder h(rec_out); // Auto free. + if (!key_itr.get(rec_out).ok()) break; + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + } while (false); // Print the first record only. + + // After goto begin: cursor is located at the beginning. + std::cout << "after gotoBegin:" << std::endl; + key_itr.gotoBegin(); + do { + Record rec_out; + Record::Holder h(rec_out); // Auto free. + if (!key_itr.get(rec_out).ok()) break; + std::cout << rec_out.kv.key.toString() << ", " + << rec_out.kv.value.toString() << std::endl; + } while (false); // Print the first record only. + + // Close the key-iterator. + key_itr.close(); + + // Close and free DB instance. + DB::close(db); + + // Release global resources. 
+ jungle::shutdown(); + + return 0; +} + diff --git a/examples/example_snapshot_checkpoint.cc b/examples/example_snapshot_checkpoint.cc new file mode 100644 index 0000000..d721bba --- /dev/null +++ b/examples/example_snapshot_checkpoint.cc @@ -0,0 +1,108 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include + +// For removing directory purpose, not needed in practice. +#include "test_common.h" + +using namespace jungle; + +int main() { + const static std::string path = "./db_example_snapshot"; + + // Remove existing DB first. + TestSuite::clearTestFile(path); + + // Initialize global resources with default config. + jungle::init(GlobalConfig()); + + // Open an instance at the given path, with default config. + DB* db = nullptr; + DB::open(&db, path, DBConfig()); + + // Set {"hello", "world"} pair. + db->set( KV("hello", "world") ); + + // Checkpoint - create a persistent snapshot. + uint64_t checkpoint_marker1 = 0; + db->checkpoint(checkpoint_marker1); + + // Update - set {"hello", "WORLD"} pair. + db->set( KV("hello", "WORLD") ); + + // Another checkpoint. + uint64_t checkpoint_marker2 = 0; + db->checkpoint(checkpoint_marker2); + + // Update again - set {"hello", "WoRlD"} pair. + db->set( KV("hello", "WoRlD") ); + + // Close and reopen DB. 
+ DB::close(db); + DB::open(&db, path, DBConfig()); + + // Search key "hello" in the latest DB instance, + // should return the latest value "WoRlD". + SizedBuf value_out; + Status s; + s = db->get( SizedBuf("hello"), value_out ); + std::cout << "latest: return code " << s << ", " + << value_out.toString() << std::endl; + // Should free the memory of value after use. + value_out.free(); + + // Get the current available checkpoint list. + std::list checkpoints; + db->getCheckpoints(checkpoints); + std::cout << "available checkpoint markers: "; + for (auto& entry: checkpoints) std::cout << entry << " "; + std::cout << std::endl; + + // Open a snapshot with the first checkpoint marker. + DB* snapshot1 = nullptr; + db->openSnapshot(&snapshot1, checkpoint_marker1); + + // Search key "hello" in the first snapshot, should return "world". + s = snapshot1->get( SizedBuf("hello"), value_out ); + std::cout << "snapshot1: return code " << s << ", " + << value_out.toString() << std::endl; + // Should free the memory of value after use. + value_out.free(); + + // Open a snapshot with the second checkpoint marker. + DB* snapshot2 = nullptr; + db->openSnapshot(&snapshot2, checkpoint_marker2); + + // Search key "hello" in the second snapshot, should return "WORLD". + s = snapshot2->get( SizedBuf("hello"), value_out ); + std::cout << "snapshot2: return code " << s << ", " + << value_out.toString() << std::endl; + // Should free the memory of value after use. + value_out.free(); + + // Should close snapshot handles first, + // before closing the parent DB handle. + DB::close(snapshot1); + DB::close(snapshot2); + DB::close(db); + + // Release global resources. 
+ jungle::shutdown(); + + return 0; +} + diff --git a/examples/example_snapshot_instant.cc b/examples/example_snapshot_instant.cc new file mode 100644 index 0000000..b06a6a5 --- /dev/null +++ b/examples/example_snapshot_instant.cc @@ -0,0 +1,76 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include + +// For removing directory purpose, not needed in practice. +#include "test_common.h" + +using namespace jungle; + +int main() { + const static std::string path = "./db_example_snapshot"; + + // Remove existing DB first. + TestSuite::clearTestFile(path); + + // Initialize global resources with default config. + jungle::init(GlobalConfig()); + + // Open an instance at the given path, with default config. + DB* db = nullptr; + DB::open(&db, path, DBConfig()); + + // Set {"hello", "world"} pair. + db->set( KV("hello", "world") ); + + // Open an instant snapshot without checkpoint marker. + // The snapshot is based on the current DB image, + // and will not persist after DB restarts. + DB* snapshot = nullptr; + db->openSnapshot(&snapshot); + + // Update - set {"hello", "WORLD"} pair. + db->set( KV("hello", "WORLD") ); + + // Search key "hello" in the latest DB instance, + // should return the latest value "WORLD". 
+ SizedBuf value_out; + Status s; + s = db->get( SizedBuf("hello"), value_out ); + std::cout << "latest: return code " << s << ", " + << value_out.toString() << std::endl; + // Should free the memory of value after use. + value_out.free(); + + // Search key "hello" in the snapshot, should return "world". + s = snapshot->get( SizedBuf("hello"), value_out ); + std::cout << "snapshot: return code " << s << ", " + << value_out.toString() << std::endl; + // Should free the memory of value after use. + value_out.free(); + + // Should close the snapshot handle first, + // before closing the parent DB handle. + DB::close(snapshot); + DB::close(db); + + // Release global resources. + jungle::shutdown(); + + return 0; +} + diff --git a/include/libjungle/db_config.h b/include/libjungle/db_config.h new file mode 100644 index 0000000..7880ca9 --- /dev/null +++ b/include/libjungle/db_config.h @@ -0,0 +1,446 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "record.h" + +#include +#include + +#include +#include + +namespace jungle { + +/** + * Used `typedef` to make it compatible with ForestDB's function type. + */ +typedef int (*CustomCmpFunc) + ( void* a, size_t len_a, + void* b, size_t len_b, + void* user_param ); + +enum CompactionCbDecision : int { + /** + * Keep this record. 
This record will survive after compaction. + */ + KEEP = 0, + + /** + * Drop this record. This record will not exist after compaction. + */ + DROP = 1, +}; + +struct CompactionCbParams { + CompactionCbParams() {} + Record rec; +}; + +using CompactionCbFunc = + std::function< CompactionCbDecision(const CompactionCbParams&) >; + +#if 0 +typedef CompactionCbDecision (*CompactionCbFunc) + (const CompactionCbParams& params); +#endif + +class DBConfig { +public: + DBConfig() + : allowOverwriteSeqNum(false) + , logSectionOnly(false) + , truncateInconsecutiveLogs(true) + , logFileTtl_sec(0) + , maxKeepingMemtables(0) + , maxEntriesInLogFile(16384) // 16K + , maxLogFileSize(4194304) // 4MB + , cmpFunc(nullptr) + , cmpFuncParam(nullptr) + , compactionCbFunc(nullptr) + , allowLogging(true) + , throttlingThreshold(10000) + , bulkLoading(false) + , numL0Partitions(4) + , minFileSizeToCompact(16777216) // 16MB + , minBlockReuseCycleToCompact(0) + , maxBlockReuseCycle(1) + , compactionFactor(300) // 300% + , blockReuseFactor(300) // 300% + , useBloomFilterForGet(true) + , bloomFilterBitsPerUnit(0.0) + , nextLevelExtension(true) + , maxL0TableSize(1073741824) // 1GB + , maxL1TableSize(2684354560) // 2.5GB + , maxL1Size((uint64_t)120 * 1073741824) // 120 GB + , maxParallelWritesPerJob(0) + , readOnly(false) + , directIo(false) + { + tableSizeRatio.push_back(2.5); + levelSizeRatio.push_back(10.0); + + lookupBoosterLimit_mb.push_back(100); + lookupBoosterLimit_mb.push_back(200); + } + + /** + * Check if this config is valid. + * + * @return `true` if valid. + */ + bool isValid() const; + + /** + * Calculate the maximum table size of the given level. + * + * @param level Level. + * @return Maximum table size in bytes. + */ + uint64_t getMaxTableSize(size_t level) const; + + /** + * Calculate the maximum parallel disk write threads per compaction. + * + * @return The number of threads. + */ + size_t getMaxParallelWriters() const; + + /** + * Allow overwriting logs that already exist. 
+ */ + bool allowOverwriteSeqNum; + + /* + * Disable table section and use logging part only. + */ + bool logSectionOnly; + + /* + * (Only when `logSectionOnly == true`) + * Truncate tail logs if they are inconsecutive, + * to avoid empty log (a hole) in the middle. + */ + bool truncateInconsecutiveLogs; + + /** + * (Only when `logSectionOnly == true`) + * TTL for log file in second. + * If it is non-zero, the mem-table of the log file will + * be purged once that file is not accessed for the given time. + */ + uint32_t logFileTtl_sec; + + /** + * (Only when `logSectionOnly == true`) + * Number of memtables kept in memory at the same time. + * If it is non-zero, and if the number of memtables exceeds + * this number, the oldest memtable will be purged from memory + * even before the TTL of corresponding log file. + */ + uint32_t maxKeepingMemtables; + + /** + * Max number of logs in a file. + */ + uint32_t maxEntriesInLogFile; + + /** + * Max size of a log file. + */ + uint32_t maxLogFileSize; + + /** + * Custom comparison function. + */ + CustomCmpFunc cmpFunc; + + /** + * Parameter for custom comparison function. + */ + void* cmpFuncParam; + + /** + * Compaction callback function. + */ + CompactionCbFunc compactionCbFunc; + + /** + * Allow logging system info. + */ + bool allowLogging; + + /** + * Minimum number of records for triggering write throttling. + */ + uint32_t throttlingThreshold; + + /** + * Bulk loading mode. + */ + bool bulkLoading; + + /** + * Number of partitions in level-0. + */ + uint32_t numL0Partitions; + + /** + * Minimum file size that can be compacted. + */ + uint64_t minFileSizeToCompact; + + /** + * Minimum block re-use cycle to trigger compaction. + */ + uint32_t minBlockReuseCycleToCompact; + + /** + * If non-zero, ForestDB's block reuse cycle will be + * limited to given number. After that the file will + * be growing without reusing. 
+ */ + uint32_t maxBlockReuseCycle; + + /** + * File size ratio threshold to trigger compaction, in percentage. + * e.g.) 150 == 150%, which means that compaction will + * be triggered if file size becomes 150% of the active + * data size. + */ + uint32_t compactionFactor; + + /** + * File size ratio threshold to trigger block reuse, in percentage. + */ + uint32_t blockReuseFactor; + + /** + * If `false`, point get will not use bloom filter even though it exists. + */ + bool useBloomFilterForGet; + + /** + * LSM-mode: Bloom filter's bits per key. + * Jungle mode: Bloom filter's bits per 1KB portion of table. + */ + double bloomFilterBitsPerUnit; + + /** + * Use range-partitioned L1+ for non-LSM mode. + */ + bool nextLevelExtension; + + /** + * L0 table size limit. + */ + uint64_t maxL0TableSize; + + /** + * L1+ table size limit. + */ + uint64_t maxL1TableSize; + + /** + * L1 level size limit. + * The other levels (L2, L3, ...) will be determined by + * both `maxL1Size` and `multiplicationFactor`. + */ + uint64_t maxL1Size; + + /** + * Starting from L2, the size ratio of table compared to + * the previous level: { L2/L1, L3/L2, ... }. + * For the levels not given in this vector, the last + * ratio will be used. + * If not given, it will be fixed to 10. + */ + std::vector tableSizeRatio; + + /** + * Starting from L2, the size ratio of level compared to + * the previous level: { L2/L1, L3/L2, ... }. + * For the levels not given in this vector, the last + * ratio will be used. + * If not given, `multiplicationFactor` will be used. + */ + std::vector levelSizeRatio; + + /** + * Size limit of in-memory lookup booster for each level. + */ + std::vector lookupBoosterLimit_mb; + + /** + * Maximum number of writers for each job (compaction, split). + * If 0, this number will be automatically adjusted considering + * the number of flushers and compactors. + */ + uint32_t maxParallelWritesPerJob; + + /** + * If `true`, read-only mode. No modify, recovery, and compaction. 
+ */ + bool readOnly; + + /** + * If `true`, use direct-IO bypassing OS page cache. + * Currently only supported for log files. + * Default: `false` + */ + bool directIo; +}; + +class GlobalConfig { +public: + GlobalConfig() + : globalLogPath("./") + , numFlusherThreads(1) + , flusherSleepDuration_ms(500) + , flusherMinRecordsToTrigger(65536) + , flusherMinLogFilesToTrigger(16) + , flusherAutoSync(false) + , numCompactorThreads(2) + , compactorSleepDuration_ms(5000) + , logFileReclaimerSleep_sec(5) + , fdbCacheSize(0) + , numTableWriters(8) + , memTableFlushBufferSize(32768) + , shutdownLogger(true) + {} + + /** + * Path where Jungle's global log will be located. + */ + std::string globalLogPath; + + /** + * Max number of flusher threads. + */ + size_t numFlusherThreads; + + /** + * Flusher thread sleep time in ms. + */ + size_t flusherSleepDuration_ms; + + /** + * Minimum number of records that triggers flushing. + */ + size_t flusherMinRecordsToTrigger; + + /** + * Minimum number of log files that triggers flushing. + */ + size_t flusherMinLogFilesToTrigger; + + /** + * Automatic sync before flushing. + */ + bool flusherAutoSync; + + /** + * Max number of compactor threads. + */ + size_t numCompactorThreads; + + /** + * Compactor thread sleep time in ms. + */ + size_t compactorSleepDuration_ms; + + /** + * Sleep duration of background log reclaimer. + */ + size_t logFileReclaimerSleep_sec; + + /** + * Underlying ForestDB's buffer cache size. + */ + uint64_t fdbCacheSize; + + /** + * Size of thread pool for table mutate tasks + * (flush, compaction, and split). + * NOTE: Both `numTableWriterGroups` and `numTableWritersPerGroup` + * have been deprecated. + */ + size_t numTableWriters; + + /** + * Size of buffer when flushing MemTable to log file. + */ + size_t memTableFlushBufferSize; + + /** + * Settings for idle time compaction. 
+ * + * Compaction factor will be adjusted to the given value + * if traffic to this process is lower than the given threshold + * for the given time window. + */ + struct IdleTimeCompactionOptions { + IdleTimeCompactionOptions() + : timeWindow_sec(0) + , startHour(0) + , endHour(0) + , iopsThreshold(1000) + , factor(125) + {} + + /** + * Time window to check whether the process is idle. + * If zero, idle time compaction will not be activated. + */ + uint32_t timeWindow_sec; + + /** + * Start hour (24h format) to trigger compaction. + * If start and end hours are the same, this condition will be ignored. + */ + uint32_t startHour; + + /** + * End hour (24h format) to trigger compaction. + * If start and end hours are the same, this condition will be ignored. + */ + uint32_t endHour; + + /** + * IOPS threshold to determine whether the process is idle. + */ + uint32_t iopsThreshold; + + /** + * Temporary compaction factor if the process is idle. + */ + uint32_t factor; + }; + + /** + * Idle time compaction options. + */ + IdleTimeCompactionOptions itcOpt; + + /** + * Shutdown system logger on shutdown of Jungle. + */ + bool shutdownLogger; +}; + +} // namespace jungle + diff --git a/include/libjungle/db_stats.h b/include/libjungle/db_stats.h new file mode 100644 index 0000000..bfb0d35 --- /dev/null +++ b/include/libjungle/db_stats.h @@ -0,0 +1,55 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include +#include + +namespace jungle { + +class DBStats { +public: + DBStats() + : numKvs(0) + , workingSetSizeByte(0) + , cacheSizeByte(0) + , cacheUsedByte(0) + {} + + /** + * Approximate the number of key-value pairs in DB. + */ + uint64_t numKvs; + + /** + * Total working set (i.e., valid KV pairs) size. + */ + uint64_t workingSetSizeByte; + + /** + * Total block cache capacity (byte). + */ + uint64_t cacheSizeByte; + + /** + * Amount of cache used (byte). + */ + uint64_t cacheUsedByte; +}; + +} // namespace jungle + diff --git a/include/libjungle/iterator.h b/include/libjungle/iterator.h new file mode 100644 index 0000000..5861862 --- /dev/null +++ b/include/libjungle/iterator.h @@ -0,0 +1,168 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "keyvalue.h" +#include "status.h" +#include "record.h" + +namespace jungle { + +class DB; +class Iterator { +public: + Iterator(); + ~Iterator(); + + enum SeekOption { + /** + * If exact match does not exist, find the smallest key + * which is greater than the given key. 
+ */ + GREATER = 0, + + /** + * If exact match does not exist, find the greatest key + * which is smaller than the given key. + */ + SMALLER = 1, + }; + + /** + * Initialize a key iterator, based on the given DB (or snapshot) instance. + * Note that even though the given instance is empty, this API will succeed. + * + * @param db DB (or snapshot) instance. + * @param start_key + * Lower bound of the iterator (inclusive). If exact match does not + * exist, the iterator will start from the smallest key which is + * greater than the given `start_key`. + * If not given, there will be no lower bound for the iterator. + * @param end_key + * Upper bound of the iterator (inclusive). If exact match does not + * exist, the iterator will start from the greatest key which is + * smaller than the given `end_key`. + * If not given, there will be no upper bound for the iterator. + * @return OK on success. + */ + Status init(DB* db, + const SizedBuf& start_key = SizedBuf(), + const SizedBuf& end_key = SizedBuf()); + + /** + * Initialize a sequence number iterator, based on the given DB + * (or snapshot) instance. + * Note that even though the given instance is empty, this API will succeed. + * + * @param db DB (or snapshot) instance. + * @param min_seq + * Lower bound of the iterator (inclusive). If exact match does not + * exist, the iterator will start from the smallest sequence number + * which is greater than the given `min_seq`. + * If not given, there will be no lower bound for the iterator. + * @param max_seq + * Upper bound of the iterator (inclusive). If exact match does not + * exist, the iterator will start from the greatest sequence number + * which is smaller than the given `max_seq`. + * If not given, there will be no upper bound for the iterator. + * @return OK on success. + */ + Status initSN(DB* db, + const uint64_t min_seq = -1, + const uint64_t max_seq = -1); + + /** + * Get the record currently pointed to by the iterator. 
+ * + * If this is a sequence number iterator, it will return + * tombstones. Key iterator will not return tombstones. + * + * User is responsible for freeing the memory of `rec_out`, + * by using `Record::free()` or `Record::Holder`. + * + * @param[out] rec_out Reference to the record + * where the result will be stored. + * @return OK on success. + */ + Status get(Record& rec_out); + + /** + * Move the cursor of the iterator one step backward. + * + * @return OK on success. + */ + Status prev(); + + /** + * Move the cursor of the iterator one step forward. + * + * @return OK on success. + */ + Status next(); + + /** + * Move the cursor of the iterator to the given key. + * It is valid only for key iterator. + * If exact match does not exist, the cursor will be located according + * to the given seek option. + * + * @param key Key to find. + * @param opt Seek option. + * @return OK on success. + */ + Status seek(const SizedBuf& key, SeekOption opt = GREATER); + + /** + * Move the cursor of the iterator to the given sequence number. + * It is valid only for sequence number iterator. + * If exact match does not exist, the cursor will be located according + * to the given seek option. + * + * @param seqnum Sequence number to find. + * @param opt Seek option. + * @return OK on success. + */ + Status seekSN(const uint64_t seqnum, SeekOption opt = GREATER); + + /** + * Move the cursor to the beginning of the iterator. + * + * @return OK on success. + */ + Status gotoBegin(); + + /** + * Move the cursor to the end of the iterator. + * + * @return OK on success. + */ + Status gotoEnd(); + + /** + * Close the iterator. + * + * @return OK on success. 
+ */ + Status close(); + +private: + class IteratorInternal; + IteratorInternal* const p; + using ItrInt = Iterator::IteratorInternal; +}; + +} // namespace jungle diff --git a/include/libjungle/jungle.h b/include/libjungle/jungle.h new file mode 100644 index 0000000..2282ed4 --- /dev/null +++ b/include/libjungle/jungle.h @@ -0,0 +1,719 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "db_config.h" +#include "db_stats.h" +#include "iterator.h" +#include "keyvalue.h" +#include "record.h" +#include "status.h" + +#include +#include +#include + +namespace jungle { + +class FlushOptions { +public: + FlushOptions() + : purgeOnly(false) + , syncOnly(false) + , callFsync(false) + , beyondLastSync(false) + , numFilesLimit(0) + , execDelayUs(0) + {} + + /** + * If `true`, records will not be stored in back-end table, + * but just will be purged from log. + */ + bool purgeOnly; + + /** + * (Only in async flush) + * If `true`, records will be written to log file only, + * will not be flushed to table section. + */ + bool syncOnly; + + /** + * (Only in async flush) + * If `true`, call `fsync()` on log files before flushing + * to table section. + */ + bool callFsync; + + /** + * If `true`, flush all logs currently exist, + * including logs not explicitly synced yet. 
+ * If `false`, flushing only happens upto the last synced log. + */ + bool beyondLastSync; + + /** + * Limit the number of log files to be flushed at once. + * Disabled if 0. + */ + uint32_t numFilesLimit; + + /** + * (Only in async flush) + * If non-zero, given request will not be executed immediately, + * and Jungle will wait and merge incoming requests for the given + * time delay, and then execute them at once. + */ + uint32_t execDelayUs; +}; + +class CompactOptions { +public: + CompactOptions() + : preserveTombstone(false) + {} + + /** + * If true, deletion marker (i.e., tombstone) will be + * alive even after compaction. + */ + bool preserveTombstone; +}; + +struct DebugParams { + DebugParams() + : compactionDelayUs(0) + , compactionItrScanDelayUs(0) + , urgentCompactionFilesize(0) + , urgentCompactionRatio(0) + , rollbackDelayUs(0) + , logDetailsOfKeyNotFound(false) + , tableSetBatchCb(nullptr) + , addNewLogFileCb(nullptr) + {} + + /** + * If non-zero, every record copy during compaction will + * sleep this amount of time. + */ + uint32_t compactionDelayUs; + + /** + * If non-zero, every record scan at the 2nd phase of compaction + * will sleep this amount of time. + */ + uint32_t compactionItrScanDelayUs; + + /** + * If non-zero, background compaction will be invoked + * once file size becomes bigger than this value, + * regardless of other factors such as block reuse cycle + * or stale data ratio. + */ + uint64_t urgentCompactionFilesize; + + /** + * If bigger than 100, compaction factors (ratio) of all opened DBs + * are temporarily overwritten by this value. + * The same as compaction factor, the unit is percentage: + * e.g.) 200 -> trigger compaction at 200%. + */ + uint64_t urgentCompactionRatio; + + /** + * If non-zero, every file removal or truncation during rollback + * will sleep this amount of time. + */ + uint32_t rollbackDelayUs; + + /** + * If `true`, leave detailed logs if given key is not found. 
+ */ + bool logDetailsOfKeyNotFound; + + struct GenericCbParams { + GenericCbParams() {} + }; + + /** + * Callback function that will be invoked at the end of each + * table write batch. + */ + std::function< void(const GenericCbParams&) > tableSetBatchCb; + + /** + * Callback function that will be invoked at the moment + * new log file is added, but right before appending the first log. + */ + std::function< void(const GenericCbParams&) > addNewLogFileCb; +}; + +using UserHandler = std::function< void(Status, void*) >; + +// Opaque class definition +namespace checker { + class Checker; +}; +class DB { + friend class checker::Checker; + friend class Compactor; + friend class DBMgr; + friend class DBGroup; + friend class Iterator; + friend class Flusher; + friend class FlusherQueue; + friend class LogMgr; + friend class LogReclaimer; + friend class Merger; + friend class TableMgr; + +public: + /** + * Null sequence number. + */ + static const uint64_t NULL_SEQNUM = static_cast(-1); + + /** + * Initialize process-wide global resources such as + * block cache and background workers. + * + * @param global_config Global configurations. + * @return OK on success. + */ + static Status init(const GlobalConfig& global_config); + + /** + * Release process-wide global resources. + * + * @return OK on success. + */ + static Status shutdown(); + + /** + * Open a Jungle instance in the given path. + * If the given path is empty, create a new one. + * + * @param[out] ptr_out Pointer to the instance as a result of this API call. + * @param path Path to open (or to create) an instance. + * @param db_config DB configurations. + * @return OK on success. + */ + static Status open(DB** ptr_out, + const std::string& path, + const DBConfig& db_config); + + /** + * Close a Jungle instance. + * Note that the allocated memory pointed to by the given instance + * will be released, so that user does not need to explicitly call `delete`. + * + * @param db Pointer to the instance to be closed. 
+ * @return OK on success. + */ + static Status close(DB* db); + + /** + * Check if the Jungle instance at the given path is in log section mode or not, + * without opening the instance itself. + * + * @param path Path to an instance to check. + * @return `true` if the instance at the path is for log section mode. + */ + static bool isLogSectionMode(const std::string& path); + + /** + * Open a snapshot. + * + * @param[out] snap_out + * Pointer to the snapshot instance as a result of this API call. + * @param checkpoint + * Checkpoint number of the snapshot to open. If `0`, will open a + * snapshot based on the latest image. + * @return OK on success. + */ + Status openSnapshot(DB** snap_out, + const uint64_t checkpoint = 0); + + /** + * Rollback the given instance to the given sequence number. + * Only supported in log section mode now. + * + * @param seqnum_upto Rollback point (exclusive). This sequence number will be + * preserved. + * @return OK on success. + */ + Status rollback(uint64_t seqnum_upto); + + /** + * Set (upsert) a key-value pair. + * Sequence number will be automatically assigned. + * + * @param kv Key-value pair to set. + * @return OK on success. + */ + Status set(const KV& kv); + + /** + * Set (upsert) a key-value pair, with custom sequence number. + * Sequence number does not need to be consecutive, + * but should be in an increasing order and also should be unique. + * + * @param seq_num Custom sequence number. + * @param kv Key-value pair to set. + * @return OK on success. + */ + Status setSN(const uint64_t seq_num, const KV& kv); + + /** + * Set (upsert) a record, with custom metadata and sequence number. + * Sequence number does not need to be consecutive, + * but should be in an increasing order and also should be unique. + * + * @param rec Record to set. + * @return OK on success. + */ + Status setRecord(const Record& rec); + + /** + * Set (upsert) a record, with custom metadata. + * Sequence number will be automatically assigned. 
+ * + * @param rec Record to set. + * @return OK on success. + */ + Status setRecordByKey(const Record& rec); + + /** + * Set (upsert) a set of records in batch. + * + * @param batch Set of records. + * @return OK on success. + */ + Status setRecordByKeyMulti(std::list& batch, + bool last_batch = false); + + /** + * Get the value corresponding to the given key. + * + * User is responsible for freeing the memory of `value_out`, + * by using `SizedBuf::free()` or `SizedBuf::Holder`. + * + * @param key Key to find. + * @param[out] value Reference to the buffer where the result will be stored. + * @return OK on success. + */ + Status get(const SizedBuf& key, SizedBuf& value_out); + + /** + * Get the key-value pair corresponding to the given sequence number. + * Only key-value pairs in log section will be visible. + * + * User is responsible for freeing the memory of `kv_out`, + * by using `KV::free()` or `KV::Holder`. + * + * @param seq_num Sequence number to find. + * @param[out] kv_out Reference to the key-value buffer + * where the result will be stored. + * @return OK on success. + */ + Status getSN(const uint64_t seq_num, KV& kv_out); + + /** + * Get the record corresponding to the given sequence number. + * Only key-value pairs in log section will be visible. + * + * User is responsible for freeing the memory of `rec_out`, + * by using `Record::free()` or `Record::Holder`. + * + * @param seq_num Sequence number to find. + * @param[out] rec_out Reference to the record + * where the result will be stored. + * @return OK on success. + */ + Status getRecord(const uint64_t seq_num, Record& rec_out); + + /** + * Get the record corresponding to the given key. + * + * User is responsible for freeing the memory of `rec_out`, + * by using `Record::free()` or `Record::Holder`. + * + * @param key Key to find. + * @param[out] rec_out Reference to the record + * where the result will be stored. 
+ * @param meta_only + * If `true`, + * 1) value part will not be retrieved, and + * 2) removed record will be searched unless + * they are already compacted and purged. + * + * @return OK on success. + */ + Status getRecordByKey(const SizedBuf& key, + Record& rec_out, + bool meta_only = false); + + /** + * Delete the key-value pair corresponding to the given key. + * + * @param key Key to delete. + * @return OK on success. + */ + Status del(const SizedBuf& key); + + /** + * Delete the key-value pair corresponding to the given key, + * with custom sequence number. Sequence number given to this API is + * a number corresponding to delete operation itself, which will + * be used as a tombstone. + * + * Sequence number does not need to be consecutive, + * but should be in an increasing order and also should be unique. + * + * @param seq_num Custom sequence number. + * @param key Key to delete. + * @return OK on success. + */ + Status delSN(const uint64_t seq_num, const SizedBuf& key); + + /** + * Delete a record, with custom metadata and sequence number. + * Sequence number given to this API is a number corresponding + * to delete operation itself, which will be used as a tombstone. + * + * Sequence number does not need to be consecutive, + * but should be in an increasing order and also should be unique. + * + * @param rec Record to set. + * @return OK on success. + */ + Status delRecord(const Record& rec); + + /** + * Get the maximum sequence number in the log section. + * + * @param[out] seq_num_out Reference to sequence number as a result. + * @return OK on success. + */ + Status getMaxSeqNum(uint64_t& seq_num_out); + + /** + * Get the minimum sequence number in the log section. + * + * @param[out] seq_num_out Reference to sequence number as a result. + * @return OK on success. + */ + Status getMinSeqNum(uint64_t& seq_num_out); + + /** + * Get the last flushed sequence number. + * "Flush" means: + * - Normal mode: merging into L0+ tables. 
+ * - Log section mode: log compaction. + * + * @param[out] seq_num_out Reference to sequence number as a result. + * @return OK on success. + */ + Status getLastFlushedSeqNum(uint64_t& seq_num_out); + + /** + * Get the last synced (written to file) sequence number. + * + * @param[out] seq_num_out Reference to sequence number as a result. + * @return OK on success. + */ + Status getLastSyncedSeqNum(uint64_t& seq_num_out); + + /** + * Get the list of checkpoint markers. + * + * @param[out] chk_out Checkpoint markers. + * @return OK on success. + */ + Status getCheckpoints(std::list& chk_out); + + /** + * Do sync (writing to file). + * Only one thread can execute this operation at a time, and other threads + * will be blocked. + * + * @param call_fsync If `true`, call `fsync` for each file + * after writing data is done. + * @return OK on success. + */ + Status sync(bool call_fsync = true); + + /** + * Do sync (writing to file). + * Only one thread can execute this operation at a time, and other threads + * will return immediately, without waiting. + * + * @param call_fsync If `true`, call `fsync` for each file + * after writing data is done. + * @return OK on success. + * OPERATION_IN_PROGRESS if other thread is working on it. + */ + Status syncNoWait(bool call_fsync = true); + + /** + * Flush logs and merge them into table up to given sequence number. + * In log section mode, this API is used for log compaction, which is + * the same as `purgeOnly = true` option. + * + * Only one thread can execute this operation at a time, and other threads + * will return immediately, without waiting. + * + * @param options Flush operation options. + * @param seq_num Max sequence number to flush. + * If not given, it will flush all logs. + * @return OK on success. + */ + Status flushLogs(const FlushOptions& options, + const uint64_t seq_num = -1); + + /** + * Flush logs asynchronously. + * This API can be used to call `sync` API asynchronously as well. 
+ * + * @param options Flush operation options. + * @param handler Handler that will be invoked after the request is done. + * @param ctx Generic pointer that will be passed to handler. + * @param seq_num Max sequence number to flush. + * If not given, it will flush all logs. + * @return OK on success. + */ + Status flushLogsAsync(const FlushOptions& options, + UserHandler handler, + void* ctx, + const uint64_t seq_num = -1); + + /** + * Add a checkpoint marker. + * This API will internally call `sync` operation. + * + * @param[out] seq_num_out + * Sequence number that will be used as a checkpoint marker. + * @param call_fsync + * If `true`, call `fsync` for each file after writing data is done. + * @return OK on success. + */ + Status checkpoint(uint64_t& seq_num_out, bool call_fsync = true); + + /** + * Do compaction on the table for given hash number in level-0. + * + * @param options Compaction options. + * @param hash_num Hash partition number. + * @return OK on success. + */ + Status compactL0(const CompactOptions& options, + uint32_t hash_num); + + /** + * Do inter-level compaction on a table in the given level, + * except for level-0. This API will internally find the most + * suitable table to compact. + * + * @param options Compaction options. + * @param level Level to compact. + * @return OK on success. + * TABLE_NOT_FOUND if there is no table to compact. + */ + Status compactLevel(const CompactOptions& options, + size_t level); + + /** + * Do in-place compaction on a table in the given level, + * except for level-0. This API will internally find the most + * suitable table to compact. + * + * @param options Compaction options. + * @param level Level to compact. + * @return OK on success. + * TABLE_NOT_FOUND if there is no table to compact. + */ + Status compactInplace(const CompactOptions& options, + size_t level); + + /** + * Do split on a table in the given level, except for level-0. 
+ * This API will internally find the most suitable table to compact. + * + * @param options Compaction options. + * @param level Level to compact. + * @return OK on success. + * TABLE_NOT_FOUND if there is no table to split. + */ + Status splitLevel(const CompactOptions& options, + size_t level); + + /** + * Do merge of two arbitrary adjacent tables in the given level, + * except for level-0. This API will internally find the most + * suitable table to compact. + * + * @param options Compaction options. + * @param level Level to merge. + * @return OK on success. + * TABLE_NOT_FOUND if there is no table to split. + */ + Status mergeLevel(const CompactOptions& options, + size_t level); + + /** + * Get the current statistics of the Jungle instance. + * + * @param[out] stats_out Stats as a result of this API call. + * @return OK on success. + */ + Status getStats(DBStats& stats_out); + + /** + * Set debugging parameters. + * + * @param to New debugging parameters to set. + * @param effective_time_sec + * Effective time duration in seconds. After it expires, debugging + * parameters will not have any impact. + * @return OK on success. + */ + static void setDebugParams(const DebugParams& to, + size_t effective_time_sec = 3600); + + /** + * Get the current debugging parameters. + * + * @return The current debugging parameters. + */ + static DebugParams getDebugParams(); + + /** + * Set the level of debugging log file (i.e., `system_logs.log`). + * + * Log level follows that in SimpleLogger: + * https://github.com/greensky00/simple_logger + * + * 0: System [====] + * 1: Fatal [FATL] + * 2: Error [ERRO] + * 3: Warning [WARN] + * 4: Info [INFO] + * 5: Debug [DEBG] + * 6: Trace [TRAC] + * + * Default: 4 (Info). + * + * @param new_level New level to set. + */ + void setLogLevel(int new_level); + + /** + * Get the current level of debugging log file. + * + * @return Log level. 
+ */ + int getLogLevel() const; + +private: + DB(); + DB(DB* _parent, uint64_t last_flush, uint64_t checkpoint); + ~DB(); + + class DBInternal; + DBInternal* const p; + + class SnapInternal; + SnapInternal* const sn; +}; + +class DBGroup { +public: + DBGroup(); + ~DBGroup(); + + static Status open(DBGroup** ptr_out, + std::string path, + const DBConfig& db_config); + static Status close(DBGroup* db_group); + + Status openDefaultDB(DB** ptr_out); + + Status openDB(DB** ptr_out, + std::string db_name); + + Status openDB(DB** ptr_out, + std::string db_name, + const DBConfig& db_config); + +private: + class DBGroupInternal; + DBGroupInternal* const p; +}; + + +// Global management functions. ===== + +/** + * Initialize process-wide global resources such as + * block cache and background workers. + * + * @param global_config Global configurations. + * @return OK on success. + */ +static inline Status init(const GlobalConfig& global_config) { + (void)init; + return DB::init(global_config); +} + +/** + * Release process-wide global resources. + * + * @return OK on success. + */ +static inline Status shutdown() { + (void)shutdown; + return DB::shutdown(); +} + +/** + * Set debugging parameters. + * + * @param to New debugging parameters to set. + * @param effective_time_sec + * Effective time duration in seconds. After it expires, debugging + * parameters will not have any impact. + * @return OK on success. + */ +static inline void setDebugParams(const DebugParams& to, + size_t effective_time_sec = 3600) { + (void)setDebugParams; + DB::setDebugParams(to, effective_time_sec); +} + +/** + * Get the current debugging parameters. + * + * @return The current debugging parameters. 
+ */ +static inline DebugParams getDebugParams() { + (void)getDebugParams; + return DB::getDebugParams(); +} + +} // namespace jungle + diff --git a/include/libjungle/keyvalue.h b/include/libjungle/keyvalue.h new file mode 100644 index 0000000..59377fe --- /dev/null +++ b/include/libjungle/keyvalue.h @@ -0,0 +1,231 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "sized_buf.h" + +#include +#include + +#include + +namespace jungle { + +class KV { +public: + /** + * Automatically free the memory of given KV. + */ + struct Holder { + Holder(KV& _kv) : kv(_kv) {} + ~Holder() { kv.free(); } + KV& kv; + }; + + /** + * Create an empty KV. + */ + KV() {} + + /** + * Create a KV referring to the given memory addresses. + * + * @param k_size Length of key. + * @param k_data Start address of key. + * @param v_size Length of value. + * @param v_data Start address of value. + */ + KV( size_t k_size, void *k_data, + size_t v_size, void *v_data ) + { + key.set(k_size, k_data); + value.set(v_size, v_data); + } + + /** + * Create a KV referring to given SizedBufs for key and value. + * + * @param k SizedBuf for key. + * @param v SizedBuf for value. + */ + KV(const SizedBuf& k, const SizedBuf& v) : key(k), value(v) {} + + /** + * Create a KV referring to the given KV. 
+ * + * @param src Source KV. + */ + KV(const KV& src) { + set(src.key, src.value); + } + + /** + * Create a KV referring to the raw memory addresses of given strings. + * + * @param k String for key. + * @param v String for value. + */ + KV(const std::string& k, const std::string& v) { + set(k, v); + } + + /** + * Create a KV referring to the raw memory addresses of given + * null-terminated C-string. + * + * @param k C-string for key. + * @param v C-string for value. + */ + KV(const char* k, const char* v) { + set(k, v); + } + + /** + * Calculate the sum of lengths of key and value. + * + * @return Length. + */ + size_t size() const { + return key.size + value.size; + } + + /** + * Move the contents of this KV to the given KV. + * This KV will become empty as a result of this API call. + * + * @param dst Destination KV. + */ + void moveTo(KV& dst) { + key.moveTo(dst.key); + value.moveTo(dst.value); + } + + /** + * Make a clone of this KV. + * User is responsible for deallocating the memory of the destination + * KV, by using `KV::Holder` or `KV::free()`. + * + * @param dst Destination KV. + */ + void copyTo(KV& dst) const { + key.copyTo(dst.key); + value.copyTo(dst.value); + } + + /** + * Make this KV refer to the raw memory addresses of given + * null-terminated C-strings. + * + * @param k C-string for key. + * @param v C-string for value. + */ + void set(const char* k, const char* v) { + key.set(strlen(k), (void*)k); + value.set(strlen(v), (void*)v); + } + + /** + * Make this KV refer to the raw memory addresses of given strings. + * + * @param k String for key. + * @param v String for value. + */ + void set(const std::string& k, const std::string& v) { + key.set(k.size(), (void*)k.data()); + value.set(v.size(), (void*)v.data()); + } + + /** + * Make this KV refer to given SizedBufs. + * + * @param k SizedBuf for key. + * @param v SizedBuf for value. 
+ */ + void set(const SizedBuf& k, const SizedBuf& v) { + key = k; + value = v; + } + + /** + * Allocate memory for this KV and copy given C-strings into it. + * User is responsible for deallocating the memory of this + * SizedBuf, by using `KV::Holder` or `KV::free()`. + * + * @param k Source C-string for key. + * @param v Source C-string for value. + */ + void alloc(const char* k, const char* v) { + key.alloc(strlen(k), (void*)k); + value.alloc(strlen(v), (void*)v); + } + + /** + * Allocate memory for this KV and copy the contents of given + * strings into it. + * User is responsible for deallocating the memory of this + * SizedBuf, by using `KV::Holder` or `KV::free()`. + * + * @param k Source string for key. + * @param v Source string for value. + */ + void alloc(const std::string& k, const std::string& v) { + key.alloc(k.size(), (void*)k.data()); + value.alloc(v.size(), (void*)v.data()); + } + + /** + * Clone the contents of given SizedBufs for key and value. + * User is responsible for deallocating the memory of this + * SizedBuf, by using `KV::Holder` or `KV::free()`. + * + * @param k Source SizedBuf for key. + * @param v Source SizedBuf for value. + */ + void alloc(const SizedBuf& k, const SizedBuf& v) { + key.alloc(k.size, k.data); + value.alloc(v.size, v.data); + } + + /** + * Deallocate the memory of key and value. + */ + void free() { + key.free(); + value.free(); + } + + /** + * Clear the memory of key and value, without deallocation. + */ + void clear() { + key.clear(); + value.clear(); + } + + /** + * Buffer for key. + */ + SizedBuf key; + + /** + * Buffer for value. + */ + SizedBuf value; +}; + +} // namespace jungle + diff --git a/include/libjungle/record.h b/include/libjungle/record.h new file mode 100644 index 0000000..7c58781 --- /dev/null +++ b/include/libjungle/record.h @@ -0,0 +1,217 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "keyvalue.h" +#include "status.h" + +#include +#include + +namespace jungle { + +class Record { +public: + enum Type : uint8_t { + /** + * Insertion marker. + */ + INSERTION = 0, + + /** + * Tombstone (deletion marker). + */ + DELETION = 1, + + /** + * Special type reserved for other commands. + */ + COMMAND = 2 + }; + + /** + * Automatically free the memory of given Record. + */ + struct Holder { + Holder(Record& _rec) : rec(_rec) {} + ~Holder() { rec.free(); } + Record& rec; + }; + + /** + * Create an empty Record. + */ + Record() + : seqNum(-1) + , type(INSERTION) {} + + /** + * Create an empty Record with the given type. + * + * @param t Type. + */ + Record(Type t) + : seqNum(-1) + , type(t) {} + + /** + * Create an empty Record with given sequence number and type. + * + * @param sn Sequence number. + * @param t Type. + */ + Record(const uint64_t sn, + const Type t) + : seqNum(sn) + , type(t) {} + + /** + * Calculate the sum of lengths of key, value, and meta. + */ + size_t size() const { + return kv.size() + meta.size; + } + + /** + * Move the contents of this Record to the given Record. + * This Record will become empty as a result of this API call. + * + * @param dst Destination Record. 
+ */ + void moveTo(Record& dst) { + kv.moveTo(dst.kv); + meta.moveTo(dst.meta); + dst.seqNum = seqNum; + dst.type = type; + clear(); + } + + /** + * Make a clone of this Record. + * User is responsible for deallocating the memory of the destination + * KV, by using `KV::Holder` or `KV::free()`. + * + * @param dst Destination Record. + */ + void copyTo(Record& dst) const { + kv.copyTo(dst.kv); + meta.copyTo(dst.meta); + dst.seqNum = seqNum; + dst.type = type; + } + + /** + * Make this Record as a clone of the given source Record. + * This record should be empty before calling this API. + * User is responsible for deallocating the memory of the destination + * KV, by using `KV::Holder` or `KV::free()`. + * + * @param src Source Record. + * @return OK on success. + */ + Status clone(const Record& src) { + if (kv.key.data || kv.value.data) { + return Status(Status::ALREADY_INITIALIZED); + } + + kv.key.alloc(src.kv.key.size, src.kv.key.data); + kv.value.alloc(src.kv.value.size, src.kv.value.data); + meta.alloc(src.meta); + seqNum = src.seqNum; + type = src.type; + + return Status(); + } + + /** + * Deallocate the memory of key, value, and meta. + */ + void free() { + kv.key.free(); + kv.value.free(); + meta.free(); + } + + /** + * Clear the memory of key, value, and meta, without deallocation. + */ + void clear() { + kv.clear(); + meta.clear(); + seqNum = -1; + type = INSERTION; + } + + /** + * Check if this Record is empty, which means both key and value are empty. + * + * @return `true` if empty. + */ + bool empty() const { return kv.key.empty() && kv.value.empty(); } + + /** + * Check if this Record is an insertion marker. + * + * @return `true` if insertion marker. + */ + bool isIns() const { return type == INSERTION; } + + /** + * Check if this Record is a tombstone. + * + * @return `true` if tombstone. + */ + bool isDel() const { return type == DELETION; } + + /** + * Check if this Record is for a special command. + * + * @return `true` if special command. 
+ */ + bool isCmd() const { return type == COMMAND; } + + /** + * Less functor. + */ + struct Less { + bool operator()(const Record* a, const Record* b) const { + return a->kv.key < b->kv.key; + } + }; + + /** + * Key and value. + */ + KV kv; + + /** + * Custom metadata. + */ + SizedBuf meta; + + /** + * Sequence number. + */ + uint64_t seqNum; + + /** + * Type. + */ + Type type; +}; + +} // namespace jungle diff --git a/include/libjungle/sized_buf.h b/include/libjungle/sized_buf.h new file mode 100644 index 0000000..acfd373 --- /dev/null +++ b/include/libjungle/sized_buf.h @@ -0,0 +1,497 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include +#include +#include + +#include + +namespace jungle { + +struct SizedBufFlags { + /** + * If this flag is set, `free` will be used for deallocation. + */ + static const uint8_t NEED_TO_FREE = 0x1; + + /** + * If this flag is set, `delete` will be used for deallocation. + */ + static const uint8_t NEED_TO_DELETE = 0x2; +}; + +class SizedBuf { +public: + /** + * Automatically free the memory of given SizedBuf. + */ + struct Holder { + Holder(SizedBuf& s) : src(s) {} + ~Holder() { src.free(); } + SizedBuf& src; + }; + + /** + * Create an empty SizedBuf. 
+ */ + SizedBuf() : flags(0x0), size(0), data(nullptr) { } + + /** + * Create a SizedBuf with allocated memory of the given size. + * User is responsible for deallocating the memory + * by using `SizedBuf::Holder` or `SizedBuf::free()`. + * + * @param s Size to allocate. + */ + SizedBuf(size_t s) : flags(0x0), size(0), data(nullptr) { + alloc(s, nullptr); + } + + /** + * Create a SizedBuf referring to the given memory address. + * + * @param s Length of the memory region that this SizedBuf will refer to. + * @param d Start address. + */ + SizedBuf(size_t s, void* d) : flags(0x0) { + set(s, d); + } + + /** + * Create a SizedBuf referring to the same memory address of the + * given source SizedBuf. + * + * @param src Source SizedBuf. + */ + SizedBuf(const SizedBuf& src) : flags(0x0) { + set(src.size, src.data); + flags = src.flags; + } + + /** + * Create a SizedBuf referring to the raw memory of the given string. + * + * @param str Source string. + */ + SizedBuf(const std::string& str) : flags(0x0) { + set(str.size(), (void*)str.data()); + } + + /** + * Create a SizedBuf referring to the memory of the given + * null-terminated C-string. + * + * @param str_char Source C-string. + */ + SizedBuf(const char* str_char) : flags(0x0) { + set(strlen(str_char), (void*)str_char); + } + + /** + * Assign the same memory address of the given SizedBuf. + * Both this and the given SizedBufs will point to the same + * memory as a result of this API call. Since both SizedBufs are + * referring to the same memory, user should be careful about double-free. + * + * @param src Source SizedBuf. + */ + SizedBuf& operator=(const SizedBuf& src) { + flags = src.flags; + size = src.size; + data = src.data; + return *this; + } + + /** + * Move the contents of this SizedBuf to the given SizedBuf. + * This SizedBuf will become empty as a result of this API call. + * + * @param dst Destination SizedBuf. 
+ */ + void moveTo(SizedBuf& dst) { + dst.flags = flags; + dst.size = size; + dst.data = data; + flags = 0x0; + size = 0; + data = nullptr; + } + + /** + * Make a clone of this SizedBuf. + * User is responsible for deallocating the memory of the destination + * SizedBuf, by using `SizedBuf::Holder` or `SizedBuf::free()`. + * + * @param dst Destination SizedBuf. + */ + void copyTo(SizedBuf& dst) const { + dst.alloc(*this); + } + + /** + * Assign the same memory address of the given SizedBuf. + * Unlike `operator=`, this API does not copy the original flag, + * which means that calling `free()` or using `SizedBuf::Holder` on + * this SizedBuf will not have any impact on the original SizedBuf. + * + * @param src Source SizedBuf. + */ + void referTo(const SizedBuf& src) { + size = src.size; + data = src.data; + flags = 0x0; + } + + /** + * Compare the given two SizedBufs in lexicographical order. + * + * @param l Left SizedBuf. + * @param r Right SizedBuf. + * @return Negative number if `l < r`, + * Zero if `l == r`, or + * Positive number if `l > r`. + */ + static inline int cmp(const SizedBuf& l, const SizedBuf& r) { + if (l.size == r.size) { + if (l.size == 0) return 0; + return memcmp(l.data, r.data, l.size); + } else { + size_t len = std::min(l.size, r.size); + int cmp = memcmp(l.data, r.data, len); + if (cmp != 0) return cmp; + else { + return (int)((int)l.size - (int)r.size); + } + } + } + + inline bool operator==(const SizedBuf &other) const { + if (size != other.size) return false; + if (size) { + return memcmp(data, other.data, size) == 0; + } else if (other.size == 0) { + // Both are empty. + return true; + } + return false; + } + + inline bool operator!=(const SizedBuf &other) const { + return !operator==(other); + } + + friend inline bool operator<(const SizedBuf& l, const SizedBuf& r) { + if (l.size == r.size) { + if (l.size == 0) return false; // Both are empty. 
+ return (memcmp(l.data, r.data, l.size) < 0); + } else if (l.size < r.size) { + if (l.size == 0) return true; + return (memcmp(l.data, r.data, l.size) <= 0); + } else { // l.size > r.size + if (r.size == 0) return false; + return (memcmp(l.data, r.data, r.size) < 0); + } + + return false; + } + + friend inline bool operator<=(const SizedBuf& l, const SizedBuf& r) { + if (l.size == r.size) { + if (l.size == 0) return true; // Both are empty. + return (memcmp(l.data, r.data, l.size) <= 0); + } else if (l.size < r.size) { + if (l.size == 0) return true; + return (memcmp(l.data, r.data, l.size) <= 0); + } else { // l.size > r.size + if (r.size == 0) return false; + return (memcmp(l.data, r.data, r.size) < 0); + } + + return false; + } + + friend inline bool operator>(const SizedBuf& l, const SizedBuf& r) { + return !operator<=(l, r); + } + + friend inline bool operator>=(const SizedBuf& l, const SizedBuf& r) { + return !operator<(l, r); + } + + #define MSG_MAX 24 + /** + * Print the contents of the given SizedBuf with readable + * (i.e., ASCII printable) characters. + * Data after `MSG_MAX` will be emitted. + */ + friend std::ostream &operator<<(std::ostream &output, const SizedBuf &sb) { + if (sb.size == 0) { + output << "(empty)"; + return output; + } + + output << "(" << sb.size << ") "; + size_t size_local = std::min(sb.size, (uint32_t)MSG_MAX); + for (size_t ii=0; ii MSG_MAX) output << "..."; + return output; + } + + /** + * Print the contents of the given SizedBuf with readable + * (i.e., ASCII printable) characters. + * Data after `MSG_MAX` will be emitted. + */ + std::string toReadableString() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + /** + * Assign the same memory address of the given SizedBuf. + * Both this and the given SizedBufs will point to the same + * memory as a result of this API call. Since both SizedBufs are + * referring to the same memory, user should be careful about double-free. 
+ * + * @param src Source SizedBuf. + */ + void set(const SizedBuf& src) { + set(src.size, src.data); + flags = src.flags; + } + + /** + * Make this SizedBuf refer to the memory of the given + * null-terminated C-string. + * + * @param str_char Source C-string. + */ + void set(const char* str_char) { + set(strlen(str_char), (void*)str_char); + } + + /** + * Make this SizedBuf refer to the raw memory of the given string. + * + * @param str Source string. + */ + void set(const std::string& str) { + set(str.size(), (void*)str.data()); + } + + /** + * Make this SizedBuf refer to the given memory address. + * + * @param s Length of the memory region that this SizedBuf will refer to. + * @param d Start address. + */ + void set(size_t s, void* d) { + clear(); + size = s; + data = static_cast(d); + } + + /** + * Clone the contents of the given SizedBuf. + * User is responsible for deallocating the memory of this + * SizedBuf, by using `SizedBuf::Holder` or `SizedBuf::free()`. + * + * @param src Source SizedBuf. + */ + void alloc(const SizedBuf& src) { + alloc(src.size, src.data); + } + + /** + * Allocate memory for this SizedBuf and copy the given C-string into it. + * User is responsible for deallocating the memory of this + * SizedBuf, by using `SizedBuf::Holder` or `SizedBuf::free()`. + * + * @param str_char Source C-string. + */ + void alloc(const char* str_char) { + alloc(strlen(str_char), (void*)str_char); + } + + /** + * Allocate memory for this SizedBuf and copy the contents of the given + * string into it. + * User is responsible for deallocating the memory of this + * SizedBuf, by using `SizedBuf::Holder` or `SizedBuf::free()`. + * + * @param str Source string. + */ + void alloc(const std::string& str) { + alloc(str.size(), (void*)str.data()); + } + + /** + * Allocate memory of the given size for this SizedBuf, and initialize it + * with zero bytes. 
+ * User is responsible for deallocating the memory of this + * SizedBuf, by using `SizedBuf::Holder` or `SizedBuf::free()`. + * + * @param s Length to allocate. + */ + void alloc(size_t s) { + alloc(s, nullptr); + } + + /** + * Allocate memory of the given size for this SizedBuf, and copy the data in + * the given memory address into it + * User is responsible for deallocating the memory of this + * SizedBuf, by using `SizedBuf::Holder` or `SizedBuf::free()`. + * + * @param s Length to allocate and copy. + * @param d Memory address of source data. + */ + void alloc(size_t s, void* d) { + clear(); + + if (s == 0) { + data = nullptr; + flags = 0x0; + return; + } + + size = s; + data = reinterpret_cast(malloc(size)); + if (d) { + // Source data is given: copy. + memcpy(data, d, size); + } else { + // NULL: just allocate space + // (set to 0 optionally). + memset(data, 0x0, size); + } + flags |= SizedBufFlags::NEED_TO_FREE; + } + + /** + * Resize the buffer. Effective only when this SizedBuf owns the memory. + * + * @param s Length to allocate and copy. + * @param d Memory address of source data. + */ + void resize(size_t _size) { + if ( !(flags & SizedBufFlags::NEED_TO_FREE) ) { + // Not owning the memory, fail. + return; + } + + uint8_t* new_ptr = reinterpret_cast(::realloc(data, _size)); + if (new_ptr) { + data = new_ptr; + size = _size; + } + } + + /** + * Export the contents of this SizedBuf as a string. + * + * @return String instance. + */ + std::string toString() { + return std::string((const char*)data, size); + } + + /** + * Deallocate the memory owned by this SizedBuf. + * If this SizedBuf does not own the memory, will do nothing. + * + * @return `true` if the memory is deallocated. + * `false` otherwise. 
+ */ + bool free() { + if (flags & SizedBufFlags::NEED_TO_FREE) { + ::free(data); + flags &= ~SizedBufFlags::NEED_TO_FREE; + clear(); + return true; + + } else if (flags & SizedBufFlags::NEED_TO_DELETE) { + delete[] data; + flags &= ~SizedBufFlags::NEED_TO_DELETE; + clear(); + return true; + } + return false; + } + + /** + * Force set the flag so as to make this SizedBuf own the memory, + * with `malloc` and `free`. + */ + void setNeedToFree() { + flags |= SizedBufFlags::NEED_TO_FREE; + } + + /** + * Force set the flag so as to make this SizedBuf own the memory, + * with `new` and `delete`. + */ + void setNeedToDelete() { + flags |= SizedBufFlags::NEED_TO_DELETE; + } + + /** + * Clear the contents without deallocation. + */ + void clear() { + flags = 0x0; + size = 0; + data = nullptr; + } + + /** + * Check if this SizedBuf is empty. + * + * @return `true` if empty. + */ + bool empty() const { + return (size == 0); + } + + /** + * Flags. + */ + uint8_t flags; + + /** + * Length of memory buffer. + */ + uint32_t size; + + /** + * Pointer to memory. + */ + uint8_t* data; +}; + +} // namespace jungle + diff --git a/include/libjungle/status.h b/include/libjungle/status.h new file mode 100644 index 0000000..0927a3c --- /dev/null +++ b/include/libjungle/status.h @@ -0,0 +1,145 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include +#include + +namespace jungle { + +class Status { +public: + enum Value { + OK = 0, + INVALID_PARAMETERS = -1, + ALREADY_EXIST = -2, + NOT_INITIALIZED = -3, + ALLOCATION_FAILURE = -4, + ALREADY_INITIALIZED = -5, + LOG_FILE_NOT_FOUND = -6, + INVALID_SEQNUM = -7, + SEQNUM_NOT_FOUND = -8, + INVALID_LEVEL = -9, + FDB_OPEN_FILE_FAIL = -10, + FDB_OPEN_KVS_FAIL = -11, + FDB_SET_FAIL = -12, + FDB_COMMIT_FAIL = -13, + LOG_NOT_SYNCED = -14, + ALREADY_PURGED = -15, + KEY_NOT_FOUND = -16, + TABLE_NOT_FOUND = -17, + ITERATOR_INIT_FAIL = -18, + OUT_OF_RANGE = -19, + ALREADY_LOADED = -20, + FILE_NOT_EXIST = -21, + KVS_NOT_FOUND = -22, + NOT_KV_PAIR = -23, + ALREADY_CLOSED = -24, + ALREADY_SHUTDOWN = -25, + INVALID_HANDLE_USAGE = -26, + LOG_NOT_EXIST = -27, + OPERATION_IN_PROGRESS = -28, + ALREADY_REMOVED = -29, + WRITE_VIOLATION = -30, + TABLES_ARE_DISABLED = -31, + INVALID_CHECKPOINT = -32, + INVALID_SNAPSHOT = -33, + FDB_CLOSE_FAIL = -34, + ALREADY_FLUSHED = -35, + DB_HANDLE_NOT_FOUND = -36, + NULL_FILEOPS_HANDLE = -37, + INVALID_FILE_DESCRIPTOR = -38, + FILE_WRITE_SIZE_MISMATCH = -39, + FILE_READ_SIZE_MISMATCH = -40, + CHECKSUM_ERROR = -41, + FILE_CORRUPTION = -42, + INVALID_RECORD = -43, + INVALID_MODE = -44, + COMPACTION_IS_NOT_ALLOWED = -45, + SNAPSHOT_NOT_FOUND = -46, + FILE_IS_NOT_IMMUTABLE = -47, + MANUAL_COMPACTION_OPEN_FAILED = -48, + FDB_KVS_CLOSE_FAIL = -49, + COMPACTION_CANCELLED = -50, + READ_VIOLATION = -51, + TIMEOUT = -52, + EVICTION_FAILED = -53, + INVALID_OFFSET = -54, + HANDLE_IS_BEING_CLOSED = -55, + ROLLBACK_IN_PROGRESS = -56, + DIRECT_IO_NOT_SUPPORTED = -57, + NOT_IMPLEMENTED = -58, + FILE_SIZE_MISMATCH = -59, + INCOMPLETE_LOG = -60, + UNKNOWN_LOG_FLAG = -61, + + ERROR = -32768 + }; + + Status() : val(OK) {} + Status(int _val) : val((Value)_val) {} + Status(Value _val) : val(_val) {} + + explicit operator bool() { return ok(); } + inline bool 
operator==(const Status::Value _val) const { + return val == _val; + } + operator int() const { return (int)val; } + Value getValue() const { return val; } + bool ok() const { return val == OK; } + std::string toString() { + static std::vector names + ({"OK", "INVALID_PARAMETERS", "ALREADY_EXIST", + "NOT_INITIALIZED", "ALLOCATION_FAILURE", "ALREADY_INITIALIZED", + "LOG_FILE_NOT_FOUND", "INVALID_SEQNUM", "SEQNUM_NOT_FOUND", + "INVALID_LEVEL", "FDB_OPEN_FILE_FAIL", "FDB_OPEN_KVS_FAIL", + "FDB_SET_FAIL", "FDB_COMMIT_FAIL", "LOG_NOT_SYNCED", + "ALREADY_PURGED", "KEY_NOT_FOUND", "TABLE_NOT_FOUND", + "ITERATOR_INIT_FAIL", "OUT_OF_RANGE", "ALREADY_LOADED", + "FILE_NOT_EXIST", "KVS_NOT_FOUND", "NOT_KV_PAIR", + "ALREADY_CLOSED", "ALREADY_SHUTDOWN", "INVALID_HANDLE_USAGE", + "LOG_NOT_EXIST", "OPERATION_IN_PROGRESS", "ALREADY_REMOVED", + "WRITE_VIOLATION", "TABLES_ARE_DISABLED", "INVALID_CHECKPOINT", + "INVALID_SNAPSHOT", "FDB_CLOSE_FAIL", "ALREADY_FLUSHED", + "DB_HANDLE_NOT_FOUND", "NULL_FILEOPS_HANDLE", + "INVALID_FILE_DESCRIPTOR", "FILE_WRITE_SIZE_MISMATCH", + "FILE_READ_SIZE_MISMATCH", "CHECKSUM_ERROR", "FILE_CORRUPTION", + "INVALID_RECORD", "INVALID_MODE", "COMPACTION_IS_NOT_ALLOWED", + "SNAPSHOT_NOT_FOUND", "FILE_IS_NOT_IMMUTABLE", + "MANUAL_COMPACTION_OPEN_FAILED", "FDB_KVS_CLOSE_FAIL", + "COMPACTION_CANCELLED", "READ_VIOLATION", "TIMEOUT", + "EVICTION_FAILED", "INVALID_OFFSET", "HANDLE_IS_BEING_CLOSED", + "ROLLBACK_IN_PROGRESS", "DIRECT_IO_NOT_SUPPORTED", + "NOT_IMPLEMENTED", "FILE_SIZE_MISMATCH", "INCOMPLETE_LOG", + "UNKNOWN_LOG_FLAG"}); + uint32_t index = -val; + if (index < names.size()) { + return names[index]; + } else if (ERROR == val) { + return "ERROR"; + } else { + return "UNKNOWN_STATUS"; + } + } + +private: + Value val; +}; + +} // namespace jungle + + diff --git a/manifest.sh b/manifest.sh new file mode 100644 index 0000000..5f2b735 --- /dev/null +++ b/manifest.sh @@ -0,0 +1 @@ +FORESTDB_COMMIT="02642fe03829fb2e2f957a07d35db2db70d3463f" diff --git 
a/prepare.sh b/prepare.sh new file mode 100644 index 0000000..fcd6460 --- /dev/null +++ b/prepare.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -ex + +. manifest.sh + +RECOMPILE_FDB=true + +if [ -d third_party/forestdb ]; then + pushd third_party/forestdb + if [ $(git rev-parse HEAD) == ${FORESTDB_COMMIT} ]; then + RECOMPILE_FDB=false + fi + popd + git submodule update +fi + +if [ ! -f third_party/forestdb/CMakeLists.txt ]; then + git submodule update --init +fi + +pushd third_party/forestdb/ +if [ ${RECOMPILE_FDB} == true ]; then + git pull origin master + git reset --hard ${FORESTDB_COMMIT} + rm -rf ./build + mkdir build + cd build + cmake -DSNAPPY_OPTION=Disable ../ + make static_lib $1 + cd .. +fi +popd diff --git a/scripts/lcov_cobertura.py b/scripts/lcov_cobertura.py new file mode 100644 index 0000000..7aae6d1 --- /dev/null +++ b/scripts/lcov_cobertura.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python + +# Copyright 2011-2012 Eric Wendelin +# +# This is free software, licensed under the Apache License, Version 2.0, +# available in the accompanying LICENSE.txt file. + +""" +Converts lcov line coverage output to Cobertura-compatible XML for CI +""" + +import re +import sys +import os +import time +import subprocess +from xml.dom import minidom +from optparse import OptionParser + +from distutils.spawn import find_executable + +CPPFILT = "c++filt" +HAVE_CPPFILT = False + +if find_executable(CPPFILT) is not None: + HAVE_CPPFILT = True + +VERSION = '1.6' +__all__ = ['LcovCobertura'] + + +class Demangler(object): + def __init__(self): + self.pipe = subprocess.Popen( + CPPFILT, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + + def demangle(self, name): + self.pipe.stdin.write(name + "\n") + return self.pipe.stdout.readline().rstrip() + + +class LcovCobertura(object): + """ + Converts code coverage report files in lcov format to Cobertura's XML + report format so that CI servers like Jenkins can aggregate results and + determine build stability etc. 
+ + >>> from lcov_cobertura import LcovCobertura + >>> LCOV_INPUT = 'your lcov input' + >>> converter = LcovCobertura(LCOV_INPUT) + >>> cobertura_xml = converter.convert() + >>> print(cobertura_xml) + """ + + def __init__(self, lcov_data, base_dir='.', excludes=None, demangle=False): + """ + Create a new :class:`LcovCobertura` object using the given `lcov_data` + and `options`. + + :param lcov_data: Path to LCOV data file + :type lcov_data: string + :param base_dir: Path upon which to base all sources + :type base_dir: string + :param excludes: list of regexes to packages as excluded + :type excludes: [string] + :param demangle: whether to demangle function names using c++filt + :type demangle: bool + """ + + if not excludes: + excludes = [] + self.lcov_data = lcov_data + self.base_dir = base_dir + self.excludes = excludes + if demangle: + demangler = Demangler() + self.format = demangler.demangle + else: + self.format = lambda x: x + + def convert(self): + """ + Convert lcov file to cobertura XML using options from this instance. + """ + coverage_data = self.parse() + return self.generate_cobertura_xml(coverage_data) + + def parse(self): + """ + Generate a data structure representing it that can be serialized in any + logical format. 
+ """ + + coverage_data = { + 'packages': {}, + 'summary': {'lines-total': 0, 'lines-covered': 0, + 'branches-total': 0, 'branches-covered': 0}, + 'timestamp': str(int(time.time())) + } + package = None + current_file = None + file_lines_total = 0 + file_lines_covered = 0 + file_lines = {} + file_methods = {} + file_branches_total = 0 + file_branches_covered = 0 + + for line in self.lcov_data.split('\n'): + if line.strip() == 'end_of_record': + if current_file is not None: + package_dict = coverage_data['packages'][package] + package_dict['lines-total'] += file_lines_total + package_dict['lines-covered'] += file_lines_covered + package_dict['branches-total'] += file_branches_total + package_dict['branches-covered'] += file_branches_covered + file_dict = package_dict['classes'][current_file] + file_dict['lines-total'] = file_lines_total + file_dict['lines-covered'] = file_lines_covered + file_dict['lines'] = dict(file_lines) + file_dict['methods'] = dict(file_methods) + file_dict['branches-total'] = file_branches_total + file_dict['branches-covered'] = file_branches_covered + coverage_data['summary']['lines-total'] += file_lines_total + coverage_data['summary']['lines-covered'] += file_lines_covered + coverage_data['summary']['branches-total'] += file_branches_total + coverage_data['summary']['branches-covered'] += file_branches_covered + + line_parts = line.split(':', 1) + input_type = line_parts[0] + + if input_type == 'SF': + # Get file name + file_name = line_parts[-1].strip() + relative_file_name = os.path.relpath(file_name, self.base_dir) + package = '.'.join(relative_file_name.split(os.path.sep)[0:-1]) + class_name = '.'.join(relative_file_name.split(os.path.sep)) + if package not in coverage_data['packages']: + coverage_data['packages'][package] = { + 'classes': {}, 'lines-total': 0, 'lines-covered': 0, + 'branches-total': 0, 'branches-covered': 0 + } + coverage_data['packages'][package]['classes'][ + relative_file_name] = { + 'name': class_name, 'lines': 
{}, 'lines-total': 0, + 'lines-covered': 0, 'branches-total': 0, + 'branches-covered': 0 + } + package = package + current_file = relative_file_name + file_lines_total = 0 + file_lines_covered = 0 + file_lines.clear() + file_methods.clear() + file_branches_total = 0 + file_branches_covered = 0 + elif input_type == 'DA': + # DA:2,0 + (line_number, line_hits) = line_parts[-1].strip().split(',') + line_number = int(line_number) + if line_number not in file_lines: + file_lines[line_number] = { + 'branch': 'false', 'branches-total': 0, + 'branches-covered': 0 + } + file_lines[line_number]['hits'] = line_hits + # Increment lines total/covered for class and package + try: + if int(line_hits) > 0: + file_lines_covered += 1 + except: + pass + file_lines_total += 1 + elif input_type == 'BRDA': + # BRDA:1,1,2,0 + (line_number, block_number, branch_number, branch_hits) = line_parts[-1].strip().split(',') + line_number = int(line_number) + if line_number not in file_lines: + file_lines[line_number] = { + 'branch': 'true', 'branches-total': 0, + 'branches-covered': 0, 'hits': 0 + } + file_lines[line_number]['branch'] = 'true' + file_lines[line_number]['branches-total'] += 1 + file_branches_total += 1 + if branch_hits != '-' and int(branch_hits) > 0: + file_lines[line_number]['branches-covered'] += 1 + file_branches_covered += 1 + elif input_type == 'BRF': + file_branches_total = int(line_parts[1]) + elif input_type == 'BRH': + file_branches_covered = int(line_parts[1]) + elif input_type == 'FN': + # FN:5,(anonymous_1) + function_line, function_name = line_parts[-1].strip().split(',') + file_methods[function_name] = [function_line, '0'] + elif input_type == 'FNDA': + # FNDA:0,(anonymous_1) + (function_hits, function_name) = line_parts[-1].strip().split(',') + if function_name not in file_methods: + file_methods[function_name] = ['0', '0'] + file_methods[function_name][-1] = function_hits + + # Exclude packages + excluded = [x for x in coverage_data['packages'] for e in 
self.excludes + if re.match(e, x)] + for package in excluded: + del coverage_data['packages'][package] + + # Compute line coverage rates + for package_data in list(coverage_data['packages'].values()): + package_data['line-rate'] = self._percent( + package_data['lines-total'], + package_data['lines-covered']) + package_data['branch-rate'] = self._percent( + package_data['branches-total'], + package_data['branches-covered']) + + return coverage_data + + def generate_cobertura_xml(self, coverage_data): + """ + Given parsed coverage data, return a String cobertura XML representation. + + :param coverage_data: Nested dict representing coverage information. + :type coverage_data: dict + """ + + dom_impl = minidom.getDOMImplementation() + doctype = dom_impl.createDocumentType("coverage", None, + "http://cobertura.sourceforge.net/xml/coverage-04.dtd") + document = dom_impl.createDocument(None, "coverage", doctype) + root = document.documentElement + summary = coverage_data['summary'] + self._attrs(root, { + 'branch-rate': self._percent(summary['branches-total'], + summary['branches-covered']), + 'branches-covered': str(summary['branches-covered']), + 'branches-valid': str(summary['branches-total']), + 'complexity': '0', + 'line-rate': self._percent(summary['lines-total'], + summary['lines-covered']), + 'lines-covered': str(summary['lines-covered']), + 'lines-valid': str(summary['lines-total']), + 'timestamp': coverage_data['timestamp'], + 'version': '2.0.3' + }) + + sources = self._el(document, 'sources', {}) + source = self._el(document, 'source', {}) + source.appendChild(document.createTextNode(self.base_dir)) + sources.appendChild(source) + + root.appendChild(sources) + + packages_el = self._el(document, 'packages', {}) + + packages = coverage_data['packages'] + for package_name, package_data in list(packages.items()): + package_el = self._el(document, 'package', { + 'line-rate': package_data['line-rate'], + 'branch-rate': package_data['branch-rate'], + 'name': 
package_name, + 'complexity': '0', + }) + classes_el = self._el(document, 'classes', {}) + for class_name, class_data in list(package_data['classes'].items()): + class_el = self._el(document, 'class', { + 'branch-rate': self._percent(class_data['branches-total'], + class_data['branches-covered']), + 'complexity': '0', + 'filename': class_name, + 'line-rate': self._percent(class_data['lines-total'], + class_data['lines-covered']), + 'name': class_data['name'] + }) + + # Process methods + methods_el = self._el(document, 'methods', {}) + for method_name, (line, hits) in list(class_data['methods'].items()): + method_el = self._el(document, 'method', { + 'name': self.format(method_name), + 'signature': '', + 'line-rate': '1.0' if int(hits) > 0 else '0.0', + 'branch-rate': '1.0' if int(hits) > 0 else '0.0', + }) + method_lines_el = self._el(document, 'lines', {}) + method_line_el = self._el(document, 'line', { + 'hits': hits, + 'number': line, + 'branch': 'false', + }) + method_lines_el.appendChild(method_line_el) + method_el.appendChild(method_lines_el) + methods_el.appendChild(method_el) + + # Process lines + lines_el = self._el(document, 'lines', {}) + lines = list(class_data['lines'].keys()) + lines.sort() + for line_number in lines: + line_el = self._el(document, 'line', { + 'branch': class_data['lines'][line_number]['branch'], + 'hits': str(class_data['lines'][line_number]['hits']), + 'number': str(line_number) + }) + if class_data['lines'][line_number]['branch'] == 'true': + total = int(class_data['lines'][line_number]['branches-total']) + covered = int(class_data['lines'][line_number]['branches-covered']) + percentage = int((covered * 100.0) / total) + line_el.setAttribute('condition-coverage', + '{0}% ({1}/{2})'.format( + percentage, covered, total)) + lines_el.appendChild(line_el) + + class_el.appendChild(methods_el) + class_el.appendChild(lines_el) + classes_el.appendChild(class_el) + package_el.appendChild(classes_el) + packages_el.appendChild(package_el) + 
root.appendChild(packages_el) + + return document.toprettyxml() + + def _el(self, document, name, attrs): + """ + Create an element within document with given name and attributes. + + :param document: Document element + :type document: Document + :param name: Element name + :type name: string + :param attrs: Attributes for element + :type attrs: dict + """ + return self._attrs(document.createElement(name), attrs) + + def _attrs(self, element, attrs): + """ + Set attributes on given element. + + :param element: DOM Element + :type element: Element + :param attrs: Attributes for element + :type attrs: dict + """ + for attr, val in list(attrs.items()): + element.setAttribute(attr, val) + return element + + def _percent(self, lines_total, lines_covered): + """ + Get the percentage of lines covered in the total, with formatting. + + :param lines_total: Total number of lines in given module + :type lines_total: number + :param lines_covered: Number of lines covered by tests in module + :type lines_covered: number + """ + + if lines_total == 0: + return '0.0' + return str(float(float(lines_covered) / float(lines_total))) + + +def main(argv=None): + """ + Converts LCOV coverage data to Cobertura-compatible XML for reporting. 
+ + Usage: + lcov_cobertura.py lcov-file.dat + lcov_cobertura.py lcov-file.dat -b src/dir -e test.lib -o path/out.xml + + By default, XML output will be written to ./coverage.xml + """ + if argv is None: + argv = sys.argv + parser = OptionParser() + parser.usage = ('lcov_cobertura.py lcov-file.dat [-b source/dir] ' + '[-e ] [-o output.xml] [-d]') + parser.description = 'Converts lcov output to cobertura-compatible XML' + parser.add_option('-b', '--base-dir', action='store', + help='Directory where source files are located', + dest='base_dir', default='.') + parser.add_option('-e', '--excludes', + help='Comma-separated list of regexes of packages to exclude', + action='append', dest='excludes', default=[]) + parser.add_option('-o', '--output', + help='Path to store cobertura xml file', + action='store', dest='output', default='coverage.xml') + parser.add_option('-d', '--demangle', + help='Demangle C++ function names using %s' % CPPFILT, + action='store_true', dest='demangle', default=False) + (options, args) = parser.parse_args(args=argv) + + if options.demangle and not HAVE_CPPFILT: + raise RuntimeError("C++ filter executable (%s) not found!" 
% CPPFILT) + + if len(args) != 2: + print(main.__doc__) + sys.exit(1) + + try: + with open(args[1], 'r') as lcov_file: + lcov_data = lcov_file.read() + lcov_cobertura = LcovCobertura(lcov_data, options.base_dir, options.excludes, options.demangle) + cobertura_xml = lcov_cobertura.convert() + with open(options.output, mode='wt') as output_file: + output_file.write(cobertura_xml) + except IOError: + sys.stderr.write("Unable to convert %s to Cobertura XML" % args[1]) + +if __name__ == '__main__': + main() diff --git a/scripts/runtests.sh b/scripts/runtests.sh new file mode 100644 index 0000000..afd63dd --- /dev/null +++ b/scripts/runtests.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +VALGRIND= + +for var in "$@" +do + if [ $var == "valgrind" ]; then + VALGRIND="valgrind --undef-value-errors=no" + echo "valgrind mode: ON" + fi + if [ $var == "directio" ]; then + export TEST_ENABLE_DIRECTIO=true + echo "direct IO mode: ON" + fi +done + +$VALGRIND ./tests/crc32_test --abort-on-failure +$VALGRIND ./tests/keyvalue_test --abort-on-failure +$VALGRIND ./tests/fileops_test --abort-on-failure +$VALGRIND ./tests/fileops_directio_test --abort-on-failure +$VALGRIND ./tests/memtable_test --abort-on-failure +$VALGRIND ./tests/table_lookup_booster_test --abort-on-failure + +$VALGRIND ./tests/basic_op_test --abort-on-failure +$VALGRIND ./tests/seq_itr_test --abort-on-failure +$VALGRIND ./tests/key_itr_test --abort-on-failure +$VALGRIND ./tests/snapshot_test --abort-on-failure +$VALGRIND ./tests/custom_cmp_test --abort-on-failure +$VALGRIND ./tests/corruption_test --abort-on-failure +$VALGRIND ./tests/compaction_test --abort-on-failure +$VALGRIND ./tests/mt_test --abort-on-failure +$VALGRIND ./tests/log_reclaim_test --abort-on-failure +$VALGRIND ./tests/level_extension_test --abort-on-failure diff --git a/src/ashared_ptr.h b/src/ashared_ptr.h new file mode 100644 index 0000000..40ae0db --- /dev/null +++ b/src/ashared_ptr.h @@ -0,0 +1,154 @@ 
/************************************************************************
Modifications Copyright 2017-2019 eBay Inc.

Original Copyright 2017 Jung-Sang Ahn
See URL: https://github.com/greensky00/latency-collector
         (v0.1.2)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
**************************************************************************/

#pragma once

#include <cstddef>
#include <cstdint>

#include <atomic>
#include <mutex>

/**
 * A shared pointer whose instances can themselves be re-pointed from
 * multiple threads: every mutation of the internal wrapper pointer is
 * guarded by a per-instance mutex, while the reference count on the
 * shared wrapper is a plain atomic.
 *
 * NOTE(review): the original template arguments and include targets were
 * lost in transport (angle-bracket spans stripped); they are reconstructed
 * here. `refCount` is assumed to be `uint64_t` -- confirm against upstream.
 */
template <typename T>
class ashared_ptr {
public:
    ashared_ptr() : object(nullptr) {}
    ashared_ptr(T* src_ptr) : object( (src_ptr)
                                      ? new PtrWrapper<T>(src_ptr)
                                      : nullptr ) {}
    ashared_ptr(const ashared_ptr<T>& src) : object(nullptr) {
        operator=(src);
    }

    ~ashared_ptr() {
        reset();
    }

    /**
     * Drop this instance's reference; the pointee is deleted when the
     * last referring instance resets or is destructed.
     */
    void reset() {
        std::lock_guard<std::mutex> l(lock);
        PtrWrapper<T>* ptr = object.load(MO);
        // Unlink pointer first, destroy object next.
        object.store(nullptr, MO);
        releaseObject(ptr);
    }

    bool operator==(const ashared_ptr<T>& src) const {
        // Identity comparison: same shared wrapper, not value equality.
        return object.load(MO) == src.object.load(MO);
    }

    bool operator==(const T* src) const {
        if (!object.load(MO)) {
            // If current `object` is NULL,
            return src == nullptr;
        }
        return object.load(MO)->ptr.load(MO) == src;
    }

    void operator=(const ashared_ptr<T>& src) {
        // BUGFIX: self-assignment previously deadlocked -- `lock` was
        // acquired here and then again (non-recursively) inside
        // shareCurObject() on the same instance.
        if (&src == this) return;

        std::lock_guard<std::mutex> l(lock);

        ashared_ptr<T>& writable_src = const_cast<ashared_ptr<T>&>(src);
        PtrWrapper<T>* src_object = writable_src.shareCurObject();

        // Replace object.
        PtrWrapper<T>* old = object.load(MO);
        object.store(src_object, MO);

        // Release old object.
        releaseObject(old);
    }

    T* operator->() const { return object.load(MO)->ptr.load(MO); }
    T& operator*() const { return *object.load(MO)->ptr.load(MO); }
    T* get() const { return object.load(MO)->ptr.load(MO); }

    inline bool compare_exchange_strong(ashared_ptr<T>& expected,
                                        ashared_ptr<T> src,
                                        std::memory_order order)
    {
        (void)order;
        return compare_exchange(expected, src);
    }

    inline bool compare_exchange_weak(ashared_ptr<T>& expected,
                                      ashared_ptr<T> src,
                                      std::memory_order order)
    {
        (void)order;
        return compare_exchange(expected, src);
    }

    /**
     * Atomically replace the pointee with `src`'s pointee if it currently
     * equals `expected`'s. On failure `expected` is refreshed to the
     * current value. Returns `true` on success.
     */
    bool compare_exchange(ashared_ptr<T>& expected, ashared_ptr<T> src) {
        // Note: it is OK that `expected` becomes outdated.
        PtrWrapper<T>* expected_ptr = expected.object.load(MO);
        PtrWrapper<T>* val_ptr = src.shareCurObject();

        { // Lock for `object`
            std::lock_guard<std::mutex> l(lock);
            if (object.compare_exchange_weak(expected_ptr, val_ptr)) {
                // Succeeded.
                // Release old object.
                releaseObject(expected.object.load(MO));
                return true;
            }
        }
        // Failed.
        expected = *this;
        // Release the object from `src`.
        releaseObject(val_ptr);
        return false;
    }

private:
    // Shared control block: raw pointee plus an atomic reference count.
    template <typename T2>
    struct PtrWrapper {
        PtrWrapper() : ptr(nullptr), refCount(0) {}
        PtrWrapper(T2* src) : ptr(src), refCount(1) {}

        std::atomic<T2*> ptr;
        std::atomic<uint64_t> refCount;
    };

    // Atomically increase ref count and then return.
    PtrWrapper<T>* shareCurObject() {
        std::lock_guard<std::mutex> l(lock);
        if (!object.load(MO)) return nullptr;

        // Now no one can change `object`.
        // By increasing its ref count, `object` will be safe
        // until the new holder (i.e., caller) is destructed.
        object.load(MO)->refCount.fetch_add(1, MO);
        return object.load(MO);
    }

    // Decrease ref count and delete if no one refers to it.
    void releaseObject(PtrWrapper<T>* target) {
        if (!target) return;
        if (target->refCount.fetch_sub(1, MO) == 1) {
            // Last shared pointer, delete it.
            delete target->ptr.load(MO);
            delete target;
        }
    }

    // Relaxed is sufficient: all cross-thread ordering is provided by `lock`.
    const static std::memory_order MO = std::memory_order_relaxed;

    std::atomic<PtrWrapper<T>*> object;
    std::mutex lock;
};
(a) : (b)) + +INLINE int _abs(int n) { + int mask = n >> ((sizeof(int)*8) -1); + return (mask + n)^mask; +} + +INLINE void avl_set_parent(struct avl_node *node, struct avl_node *parent) +{ + node->parent = (struct avl_node *)( + (uint64_t)parent | ((uint64_t)node->parent & 0x3)); +} + +#ifdef __AVL_DEBUG +#include +#include +#include "avltree_debug.h" +#else +#define __AVL_DEBUG_BF_CHECK(bf) +#define __AVL_DEBUG_LL(p, c, pb, cb) +#define __AVL_DEBUG_RR(p, c, pb, cb) +#define __AVL_DEBUG_BAL_BEGIN(node, bf, height_diff) +#define __AVL_DEBUG_BAL_END(node) +#define __AVL_DEBUG_INSERT(node) +#define __AVL_DEBUG_REMOVE(node) +#define __AVL_DEBUG_DISPLAY(tree) +#endif + +INLINE void avl_set_bf(struct avl_node *node, int bf) +{ + __AVL_DEBUG_BF_CHECK(bf); + +#ifdef _AVL_SEPARATE_PARENT_BF + node->bf = bf; +#else + node->parent = (struct avl_node *)( + (uint64_t)avl_parent(node) | (uint64_t)(bf+1)); +#endif +} + +INLINE struct avl_node* _rotate_LL(struct avl_node *parent, + int parent_bf, + int *child_bf, + int *height_delta) +// MUST ensure that parent_bf <= 0 +{ + int p_right, c_left, c_right; + struct avl_node *child = parent->left; + + __AVL_DEBUG_LL(parent, child, parent_bf, *child_bf); + + c_left = (child->left)?(1):(0); + c_right = (child->right)?(1):(0); + if (*child_bf < 0) { + // child->left > child->right + c_left = c_right - (*child_bf); + p_right = c_left + 1 + parent_bf; + if (height_delta) + *height_delta = max(c_left, max(c_right, p_right)+1) - (c_left + 1); + + } else { + // child->left <= child->right + c_right = c_left + (*child_bf); + p_right = c_right + 1 + parent_bf; + if (height_delta) + *height_delta = max(c_left, max(c_right, p_right)+1) - (c_right + 1); + } + *child_bf = (max(c_right, p_right) + 1) - c_left; + avl_set_bf(parent, p_right - c_right); + + parent->left = child->right; + if (child->right) + avl_set_parent(child->right, parent); + child->right = parent; + avl_set_parent(child, avl_parent(parent)); + avl_set_parent(parent, child); + + 
return child; +} + +INLINE struct avl_node* _rotate_RR(struct avl_node *parent, + int parent_bf, + int *child_bf, + int *height_delta) +// MUST ensure that parent_bf >= 0 +{ + int p_left, c_left, c_right; + struct avl_node *child = parent->right; + + __AVL_DEBUG_RR(parent, child, parent_bf, *child_bf); + + c_left = (child->left)?(1):(0); + c_right = (child->right)?(1):(0); + if (*child_bf < 0) { + // child->left > child->right + c_left = c_right - (*child_bf); + p_left = c_left + 1 - parent_bf; + if (height_delta) + *height_delta = max(c_right, max(c_left, p_left)+1) - (c_left + 1); + + } else { + // child->left <= child->right + c_right = c_left + (*child_bf); + p_left = c_right + 1 - parent_bf; + if (height_delta) + *height_delta = max(c_right, max(c_left, p_left)+1) - (c_right + 1); + + } + *child_bf = c_right - (max(c_left, p_left) + 1); + avl_set_bf(parent, c_left - p_left); + + parent->right = child->left; + if (child->left) + avl_set_parent(child->left, parent); + child->left = parent; + avl_set_parent(child, avl_parent(parent)); + avl_set_parent(parent, child); + + return child; +} + +INLINE struct avl_node* _rotate_LR(struct avl_node *parent, int parent_bf) +{ + int child_bf, height_delta = 0; + struct avl_node *child = parent->left; + struct avl_node *ret; + + if (child->right) { + child_bf = avl_bf(child->right); + parent->left = _rotate_RR(child, avl_bf(child), &child_bf, &height_delta); + } else { + child_bf = avl_bf(child); + } + + ret = _rotate_LL(parent, parent_bf-height_delta, &child_bf, NULL); + avl_set_bf(ret, child_bf); + return ret; +} + +INLINE struct avl_node* _rotate_RL(struct avl_node *parent, int parent_bf) +{ + int child_bf, height_delta = 0; + struct avl_node *child = parent->right; + struct avl_node *ret; + + if (child->left) { + child_bf = avl_bf(child->left); + parent->right = _rotate_LL(child, avl_bf(child), &child_bf, &height_delta); + } else { + child_bf = avl_bf(child); + } + + ret = _rotate_RR(parent, parent_bf+height_delta, 
&child_bf, NULL); + avl_set_bf(ret, child_bf); + return ret; +} + +#define _get_balance(node) ((node)?(avl_bf(node)):(0)) + +static struct avl_node* _balance_tree(struct avl_node *node, int bf) +{ + int child_bf; + int height_diff= _get_balance(node) + bf; + + if (node) { + __AVL_DEBUG_BAL_BEGIN(node, bf, height_diff); + + if(height_diff < -1 && node->left) { + // balance left sub tree + if(_get_balance(node->left) <= 0) { + child_bf = avl_bf(node->left); + node = _rotate_LL(node, height_diff, &child_bf, NULL); + avl_set_bf(node, child_bf); + } else { + node = _rotate_LR(node, height_diff); + } + } else if(height_diff > 1 && node->right) { + // balance right sub tree + if(_get_balance(node->right) >= 0) { + child_bf = avl_bf(node->right); + node = _rotate_RR(node, height_diff, &child_bf, NULL); + avl_set_bf(node, child_bf); + } else { + node = _rotate_RL(node, height_diff); + } + } else { + avl_set_bf(node, avl_bf(node) + bf); + } + + __AVL_DEBUG_BAL_END(node); + } + + return node; +} + +struct avl_node* avl_first(struct avl_tree *tree) +{ + struct avl_node *p = NULL; + struct avl_node *node = tree->root; + + while(node) { + p = node; + node = node->left; + } + return p; +} + +struct avl_node* avl_last(struct avl_tree *tree) +{ + struct avl_node *p = NULL; + struct avl_node *node = tree->root; + + while(node) { + p = node; + node = node->right; + } + return p; +} + +struct avl_node* avl_next(struct avl_node *node) +{ + if (node == NULL) return NULL; + +#ifdef _AVL_NEXT_POINTER + return node->next; +#else + + struct avl_node *p; + + // smallest value of right subtree + if (node->right) { + p = node; + node = node->right; + while (node) { + p = node; + node = node->left; + } + return p; + } + + // node does not have right child + if (avl_parent(node)) { + // find first parent that has right child + p = node; + node = avl_parent(node); + while(node) { + if (node->left == p) { + return node; + } + p = node; + node = avl_parent(node); + } + } +#endif + return NULL; +} + 
+struct avl_node* avl_prev(struct avl_node *node) +{ + if (node == NULL) return NULL; + +#ifdef _AVL_NEXT_POINTER + return node->prev; +#else + + struct avl_node *p; + + // largest value of left subtree + if (node->left) { + p = node; + node = node->left; + while (node) { + p = node; + node = node->right; + } + return p; + } + + // node does not have left child + if (avl_parent(node)) { + // find first parent that has left child + p = node; + node = avl_parent(node); + while(node) { + if (node->right == p) { + return node; + } + p = node; + node = avl_parent(node); + } + } +#endif + return NULL; +} + +struct avl_node* avl_search(struct avl_tree *tree, + struct avl_node *node, + avl_cmp_func *func) +// exact match +{ + struct avl_node *p = tree->root; + int cmp; + + while(p) + { + cmp = func(p, node, tree->aux); + if (cmp > 0) { + p = p->left; + }else if (cmp < 0){ + p = p->right; + }else { + // search success + return p; + } + } + // search fail + return NULL; +} + +struct avl_node* avl_search_greater(struct avl_tree *tree, + struct avl_node *node, + avl_cmp_func *func) +// if an exact match does not exist, +// return smallest node greater than NODE +{ + struct avl_node *p = tree->root; + struct avl_node *pp = NULL; + int cmp; + + while(p) + { + cmp = func(p, node, tree->aux); + pp = p; + + if (cmp > 0) { + p = p->left; + }else if (cmp < 0){ + p = p->right; + }else { + // search success + return p; + } + } + + if (!pp) { + return pp; + } + + cmp = func(pp, node, tree->aux); + if (cmp > 0) { + return pp; + }else{ + return avl_next(pp); + } +} + +struct avl_node* avl_search_smaller(struct avl_tree *tree, + struct avl_node *node, + avl_cmp_func *func) +// if an exact match does not exist, +// return greatest node smaller than NODE +{ + struct avl_node *p = tree->root; + struct avl_node *pp = NULL; + int cmp; + + while(p) + { + cmp = func(p, node, tree->aux); + pp = p; + + if (cmp > 0) { + p = p->left; + }else if (cmp < 0){ + p = p->right; + }else { + // search success 
+ return p; + } + } + + if (!pp) { + return pp; + } + + cmp = func(pp, node, tree->aux); + if (cmp < 0) { + return pp; + }else{ + return avl_prev(pp); + } +} + +void avl_init(struct avl_tree *tree, void *aux) +{ + tree->root = NULL; + tree->aux = aux; +} + +void avl_set_aux(struct avl_tree *tree, void *aux) +{ + tree->aux = aux; +} + +struct avl_node* avl_insert(struct avl_tree *tree, + struct avl_node *node, + avl_cmp_func *func) +{ + __AVL_DEBUG_INSERT(node); + + struct avl_node *node_original = node; + struct avl_node *p=NULL,*cur; + int cmp, bf, bf_old; + + cur = tree->root; + while(cur) + { + cmp = func(cur, node, tree->aux); + p = cur; + + if(cmp > 0) { + cur = cur->left; + }else if (cmp < 0){ + cur = cur->right; + }else { + // duplicated key -> return + return cur; + } + } + + avl_set_parent(node, p); + avl_set_bf(node, 0); + node->left = node->right = NULL; +#ifdef _AVL_NEXT_POINTER + node->prev = node->next = NULL; +#endif + + // P is parent node of CUR + if(p) { + if(func(p, node, tree->aux) > 0) { + p->left = node; +#ifdef _AVL_NEXT_POINTER + node->next = p; + node->prev = p->prev; + if (p->prev) p->prev->next = node; + p->prev = node; +#endif + + }else { + p->right = node; +#ifdef _AVL_NEXT_POINTER + node->prev = p; + node->next = p->next; + if (p->next) p->next->prev = node; + p->next = node; +#endif + } + + } else { + // no parent .. make NODE as root + tree->root = node; + } + + // recursive balancing process .. 
scan from leaf to root + bf = 0; + while(node) { + p = avl_parent(node); + + if (p) { + // if parent exists + bf_old = avl_bf(node); + + if (p->right == node) { + node = _balance_tree(node, bf); + p->right = node; + }else { + node = _balance_tree(node, bf); + p->left = node; + } + + // calculate balance facter BF for parent + if (node->left == NULL && node->right == NULL) { + // leaf node + if (p->left == node) bf = -1; + else bf = 1; + } else { + // index ndoe + bf = 0; + if (_abs(bf_old) < _abs(avl_bf(node))) { + // if ABS of balance factor increases + // cascade to parent + if (p->left == node) bf = -1; + else bf = 1; + } + } + + } else if(node == tree->root){ + tree->root = _balance_tree(tree->root, bf); + break; + } + if (bf == 0) break; + + node = p; + } + + __AVL_DEBUG_DISPLAY(tree); + + return node_original; +} + +void avl_remove(struct avl_tree *tree, + struct avl_node *node) +{ + __AVL_DEBUG_REMOVE(node); + + // not found + if (node == NULL) return; + + struct avl_tree right_subtree; + struct avl_node *p=NULL,*cur, *next=NULL; + int bf = 0, bf_old; + + +#ifdef _AVL_NEXT_POINTER + if (node->prev) node->prev->next = node->next; + if (node->next) node->next->prev = node->prev; +#endif + + // find smallest node in right sub-tree + right_subtree.root = node->right; + next = avl_first(&right_subtree); + + if (next) { + // 1. 
NEXT exists + if (avl_parent(next)) { + if (avl_parent(next) != node) { + // NODE is not NEXT's direct parent + // MUST ensure NEXT should be *left child* of its parent + // MUST ensure NEXT doesn't have right child + avl_parent(next)->left = next->right; + if (next->right) + avl_set_parent(next->right, avl_parent(next)); + } + } + if (avl_parent(node)) { + // replace NODE by NEXT + if (avl_parent(node)->left == node) { + avl_parent(node)->left = next; + } else { + avl_parent(node)->right = next; + } + } + + // re-link pointers + if (node->right != next) { + next->right = node->right; + if (node->right) avl_set_parent(node->right, next); + cur = avl_parent(next); + bf = 1; + }else{ + cur = next; + bf = -1; + } + + next->left = node->left; + if (node->left) avl_set_parent(node->left, next); + avl_set_parent(next, avl_parent(node)); + + // inherit NODE's balance factor + avl_set_bf(next, avl_bf(node)); + + } else { + // 2. NEXT == NULL (only when there's no right sub-tree) + p = avl_parent(node); + if (p) { + if (p->left == node) { + p->left = node->left; + bf = 1; + } else { + p->right = node->left; + bf = -1; + } + } + if (node->left) + avl_set_parent(node->left, p); + + cur = avl_parent(node); + } + + // reset root + if (tree->root == node) { + tree->root = next; + if (next == NULL) { + if (node->left) tree->root = node->left; + } + } + + // recursive balancing process .. 
scan from CUR to root + while(cur) { + p = avl_parent(cur); + if (p) { + // if parent exists + bf_old = avl_bf(cur); + + if (p->right == cur) { + cur = _balance_tree(cur, bf); + p->right = cur; + }else { + cur = _balance_tree(cur, bf); + p->left = cur; + } + + // calculate balance facter BF for parent + if (cur->left == NULL && cur->right == NULL) { + // leaf node + if (p->left == cur) bf = 1; + else bf = -1; + } else { + // index ndoe + bf = 0; + if (_abs(bf_old) > _abs(avl_bf(cur))) { + // if ABS of balance factor decreases + // cascade to parent + if (p->left == cur) bf = 1; + else bf = -1; + } + } + + } else if(cur == tree->root){ + tree->root = _balance_tree(tree->root, bf); + break; + } + if (bf == 0) break; + + cur = p; + } + + __AVL_DEBUG_DISPLAY(tree); +} + diff --git a/src/avltree.h b/src/avltree.h new file mode 100644 index 0000000..061d0e3 --- /dev/null +++ b/src/avltree.h @@ -0,0 +1,90 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2014 Jung-Sang Ahn +See URL: https://github.com/greensky00/avltree + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
/************************************************************************
Modifications Copyright 2017-2019 eBay Inc.

Original Copyright 2014 Jung-Sang Ahn
See URL: https://github.com/greensky00/avltree

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
**************************************************************************/


#ifndef _JSAHN_AVL_TREE_H
#define _JSAHN_AVL_TREE_H

#include "stddef.h"
#include "stdint.h"

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Intrusive AVL-tree node: embed it inside your own struct and recover
 * the enclosing struct with _get_entry().
 *
 * Unless _AVL_SEPARATE_PARENT_BF is defined, the balance factor is packed
 * into the low 2 bits of `parent` (nodes must be at least 4-byte aligned).
 */
struct avl_node {
    struct avl_node *parent, *left, *right;

#ifdef _AVL_SEPARATE_PARENT_BF
    int bf;
#endif
#ifdef _AVL_NEXT_POINTER
    struct avl_node *prev, *next;
#endif
};

/* Tree head: root node plus an opaque user pointer passed to comparators. */
struct avl_tree{
    struct avl_node *root;
    void *aux;
};

/* Recover the enclosing STRUCT from a pointer to its MEMBER field. */
#ifndef _get_entry
#define _get_entry(ELEM, STRUCT, MEMBER)                              \
        ((STRUCT *) ((uint8_t *) (ELEM) - offsetof (STRUCT, MEMBER)))
#endif

/* Parent pointer with the 2 balance-factor tag bits masked off. */
#define avl_parent(node) \
        ((struct avl_node *)((uint64_t)(node)->parent & ~0x3))

/* Balance factor in {-1, 0, +1}; stored biased by +1 in the tag bits. */
#ifdef _AVL_SEPARATE_PARENT_BF
    #define avl_bf(node) ((node)->bf)
#else
    #define avl_bf(node) (((int)((uint64_t)(node)->parent & 0x3)) - 1)
#endif

/*
 * Comparator contract:
 *   *a < *b  : return neg
 *   *a == *b : return 0
 *   *a > *b  : return pos
 */
typedef int avl_cmp_func (struct avl_node *a, struct avl_node *b, void *aux);

void avl_init(struct avl_tree *tree, void *aux);
void avl_set_aux(struct avl_tree *tree, void *aux);
struct avl_node* avl_insert(struct avl_tree *tree,
                            struct avl_node *node,
                            avl_cmp_func *func);
struct avl_node* avl_search(struct avl_tree *tree,
                            struct avl_node *node,
                            avl_cmp_func *func);
struct avl_node* avl_search_greater(struct avl_tree *tree,
                                    struct avl_node *node,
                                    avl_cmp_func *func);
struct avl_node* avl_search_smaller(struct avl_tree *tree,
                                    struct avl_node *node,
                                    avl_cmp_func *func);
void avl_remove(struct avl_tree *tree,
                struct avl_node *node);
struct avl_node* avl_first(struct avl_tree *tree);
struct avl_node* avl_last(struct avl_tree *tree);
struct avl_node* avl_next(struct avl_node *node);
struct avl_node* avl_prev(struct avl_node *node);
#ifdef __cplusplus
}
#endif

#endif
+/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2017 Jung-Sang Ahn +See URL: https://github.com/greensky00/simple_logger + (v0.3.5) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +// LCOV_EXCL_START + +#define SIZE_T_UNUSED size_t __attribute__((unused)) +#define VOID_UNUSED void __attribute__((unused)) +#define UINT64_T_UNUSED uint64_t __attribute__((unused)) +#define STR_UNUSED std::string __attribute__((unused)) +#define INTPTR_UNUSED intptr_t __attribute__((unused)) + +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef __APPLE__ +#include +#include + +static UINT64_T_UNUSED static_base_address(void) { + const struct segment_command_64* command = getsegbyname(SEG_TEXT /*"__TEXT"*/); + uint64_t addr = command->vmaddr; + return addr; +} + +static STR_UNUSED get_exec_path() { + char path[1024]; + uint32_t size = sizeof(path); + if (_NSGetExecutablePath(path, &size) != 0) return std::string(); + + return path; +} + +static STR_UNUSED get_file_part(const std::string& full_path) { + size_t pos = full_path.rfind("/"); + if (pos == std::string::npos) return full_path; + + return full_path.substr(pos + 1, full_path.size() - pos - 1); +} + +static INTPTR_UNUSED image_slide(void) { + std::string exec_path = get_exec_path(); + if (exec_path.empty()) return -1; + 
+ auto image_count = _dyld_image_count(); + for (decltype(image_count) i = 0; i < image_count; i++) { + if ( strcmp( _dyld_get_image_name(i), + exec_path.c_str() ) == 0 ) { + return _dyld_get_image_vmaddr_slide(i); + } + } + return -1; +} +#endif + + +#define _snprintf(msg, avail_len, cur_len, msg_len, ...) \ + avail_len = (avail_len > cur_len) ? (avail_len - cur_len) : 0; \ + msg_len = snprintf( msg + cur_len, avail_len, __VA_ARGS__ ); \ + cur_len += (avail_len > msg_len) ? msg_len : avail_len + +static SIZE_T_UNUSED +_stack_backtrace(void** stack_ptr, size_t stack_ptr_capacity) { + return backtrace(stack_ptr, stack_ptr_capacity); +} + +static SIZE_T_UNUSED _stack_interpret_linux(void** stack_ptr, + char** stack_msg, + int stack_size, + char* output_buf, + size_t output_buflen); + +static SIZE_T_UNUSED _stack_interpret_apple(void** stack_ptr, + char** stack_msg, + int stack_size, + char* output_buf, + size_t output_buflen); + +static SIZE_T_UNUSED _stack_interpret_other(void** stack_ptr, + char** stack_msg, + int stack_size, + char* output_buf, + size_t output_buflen); + +static SIZE_T_UNUSED +_stack_interpret(void** stack_ptr, + int stack_size, + char* output_buf, + size_t output_buflen) +{ + char** stack_msg = nullptr; + stack_msg = backtrace_symbols(stack_ptr, stack_size); + + size_t len = 0; + +#if defined(__linux__) + len = _stack_interpret_linux( stack_ptr, + stack_msg, + stack_size, + output_buf, + output_buflen ); + +#elif defined(__APPLE__) + len = _stack_interpret_apple( stack_ptr, + stack_msg, + stack_size, + output_buf, + output_buflen ); + +#else + len = _stack_interpret_other( stack_ptr, + stack_msg, + stack_size, + output_buf, + output_buflen ); + +#endif + free(stack_msg); + + return len; +} + +static SIZE_T_UNUSED _stack_interpret_linux(void** stack_ptr, + char** stack_msg, + int stack_size, + char* output_buf, + size_t output_buflen) +{ + size_t cur_len = 0; +#ifdef __linux__ + size_t frame_num = 0; + + // NOTE: starting from 1, skipping this 
frame. + for (int i = 1; i < stack_size; ++i) { + // `stack_msg[x]` format: + // /foo/bar/executable() [0xabcdef] + // /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0) [0x123456] + + // NOTE: with ASLR + // /foo/bar/executable(+0x5996) [0x555555559996] + + int fname_len = 0; + while ( stack_msg[i][fname_len] != '(' && + stack_msg[i][fname_len] != ' ' && + stack_msg[i][fname_len] != 0x0 ) { + ++fname_len; + } + + char addr_str[256]; + uintptr_t actual_addr = 0x0; + if ( stack_msg[i][fname_len] == '(' && + stack_msg[i][fname_len+1] == '+' ) { + // ASLR is enabled, get the offset from here. + int upto = fname_len + 2; + while ( stack_msg[i][upto] != ')' && + stack_msg[i][upto] != 0x0 ) { + upto++; + } + sprintf( addr_str, "%.*s", + upto - fname_len - 2, + &stack_msg[i][fname_len + 2] ); + + // Convert hex string -> integer address. + std::stringstream ss; + ss << std::hex << addr_str; + ss >> actual_addr; + + } else { + actual_addr = (uintptr_t)stack_ptr[i]; + sprintf(addr_str, "%" PRIxPTR, actual_addr); + } + + char cmd[1024]; + snprintf( cmd, 1024, "addr2line -f -e %.*s %s", + fname_len, stack_msg[i], addr_str ); + FILE* fp = popen(cmd, "r"); + if (!fp) continue; + + char mangled_name[1024]; + char file_line[1024]; + int ret = fscanf(fp, "%1023s %1023s", mangled_name, file_line); + (void)ret; + pclose(fp); + + size_t msg_len = 0; + size_t avail_len = output_buflen; + _snprintf( output_buf, avail_len, cur_len, msg_len, + "#%-2zu 0x%016" PRIxPTR " in ", + frame_num++, + actual_addr ); + + int status; + char *cc = abi::__cxa_demangle(mangled_name, 0, 0, &status); + if (cc) { + _snprintf(output_buf, avail_len, cur_len, msg_len, "%s at ", cc); + } else { + std::string msg_str = stack_msg[i]; + std::string _func_name = msg_str; + size_t s_pos = msg_str.find("("); + size_t e_pos = msg_str.rfind("+"); + if (e_pos == std::string::npos) e_pos = msg_str.rfind(")"); + if ( s_pos != std::string::npos && + e_pos != std::string::npos ) { + _func_name = 
msg_str.substr(s_pos+1, e_pos-s_pos-1); + } + _snprintf( output_buf, avail_len, cur_len, msg_len, + "%s() at ", + ( _func_name.empty() + ? mangled_name + : _func_name.c_str() ) ); + } + + _snprintf(output_buf, avail_len, cur_len, msg_len, "%s\n", file_line); + } + +#endif + return cur_len; +} + +static VOID_UNUSED skip_whitespace(const std::string base_str, size_t& cursor) { + while (base_str[cursor] == ' ') cursor++; +} + +static VOID_UNUSED skip_glyph(const std::string base_str, size_t& cursor) { + while (base_str[cursor] != ' ') cursor++; +} + +static SIZE_T_UNUSED _stack_interpret_apple(void** stack_ptr, + char** stack_msg, + int stack_size, + char* output_buf, + size_t output_buflen) +{ + size_t cur_len = 0; +#ifdef __APPLE__ + + size_t frame_num = 0; + (void)frame_num; + + std::string exec_full_path = get_exec_path(); + std::string exec_file = get_file_part( exec_full_path ); + uint64_t load_base = (uint64_t)image_slide() + static_base_address(); + + // NOTE: starting from 1, skipping this frame. + for (int i = 1; i < stack_size; ++i) { + // `stack_msg[x]` format: + // 8 foobar 0x000000010fd490da main + 1322 + if (!stack_msg[i] || stack_msg[i][0] == 0x0) continue; + + std::string base_str = stack_msg[i]; + + size_t s_pos = 0; + size_t len = 0; + size_t cursor = 0; + + // Skip frame number part. + skip_glyph(base_str, cursor); + + // Skip whitespace. + skip_whitespace(base_str, cursor); + s_pos = cursor; + // Filename part. + skip_glyph(base_str, cursor); + len = cursor - s_pos; + std::string filename = base_str.substr(s_pos, len); + + // Skip whitespace. + skip_whitespace(base_str, cursor); + s_pos = cursor; + // Address part. + skip_glyph(base_str, cursor); + len = cursor - s_pos; + std::string address = base_str.substr(s_pos, len); + if (!address.empty() && address[0] == '?') continue; + + // Skip whitespace. + skip_whitespace(base_str, cursor); + s_pos = cursor; + // Mangled function name part. 
+ skip_glyph(base_str, cursor); + len = cursor - s_pos; + std::string func_mangled = base_str.substr(s_pos, len); + + size_t msg_len = 0; + size_t avail_len = output_buflen; + + _snprintf(output_buf, avail_len, cur_len, msg_len, + "#%-2zu %s in ", + frame_num++, address.c_str() ); + + if (filename != exec_file) { + // Dynamic library. + int status; + char *cc = abi::__cxa_demangle(func_mangled.c_str(), 0, 0, &status); + if (cc) { + _snprintf( output_buf, avail_len, cur_len, msg_len, + "%s at %s\n", cc, filename.c_str() ); + } else { + _snprintf( output_buf, avail_len, cur_len, msg_len, + "%s() at %s\n", + func_mangled.c_str(), + filename.c_str() ); + } + } else { + // atos return format: + // bbb(char) (in crash_example) (crash_example.cc:37) + std::stringstream ss; + ss << "atos -l 0x"; + ss << std::hex << load_base; + ss << " -o " << exec_full_path; + ss << " " << address; + FILE* fp = popen(ss.str().c_str(), "r"); + if (!fp) continue; + + char atos_cstr[4096]; + fgets(atos_cstr, 4095, fp); + + std::string atos_str = atos_cstr; + size_t d_pos = atos_str.find(" (in "); + if (d_pos == std::string::npos) continue; + std::string function_part = atos_str.substr(0, d_pos); + + d_pos = atos_str.find(") (", d_pos); + if (d_pos == std::string::npos) continue; + std::string source_part = atos_str.substr(d_pos + 3); + source_part = source_part.substr(0, source_part.size() - 2); + + _snprintf( output_buf, avail_len, cur_len, msg_len, + "%s at %s\n", + function_part.c_str(), + source_part.c_str() ); + } + } + +#endif + return cur_len; +} + +static SIZE_T_UNUSED _stack_interpret_other(void** stack_ptr, + char** stack_msg, + int stack_size, + char* output_buf, + size_t output_buflen) +{ + size_t cur_len = 0; + size_t frame_num = 0; + (void)frame_num; + + // NOTE: starting from 1, skipping this frame. 
+ for (int i=1; i= byte_len ); + + uint8_t* ptr = (uint8_t*)bitmap.getPtr(); + memcpy(data, ptr, byte_len); +} + +void* BloomFilter::getPtr() const { + return bitmap.getPtr(); +} + +GenericBitmap& BloomFilter::getBitmap() { + return bitmap; +} + diff --git a/src/bloomfilter.h b/src/bloomfilter.h new file mode 100644 index 0000000..aa5e441 --- /dev/null +++ b/src/bloomfilter.h @@ -0,0 +1,59 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2016 Michael Schmatz +http://blog.michaelschmatz.com/2016/04/11/how-to-write-a-bloom-filter-cpp/ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "generic_bitmap.h" + +#include + +#include +#include + +class BloomFilter { +public: + BloomFilter(uint64_t bitmap_size, uint8_t hash_count); + BloomFilter(void* data, size_t len, uint8_t hash_count); + size_t size() const; + + /** + * Set the filter with the given binary. + */ + void set(void* data, size_t len); + + /** + * Check if the filter is positive with the given binary. + */ + bool check(void* data, size_t len); + + /** + * Check if the filter is positive with the given + * pre-calculated hash pair. 
+ */ + bool check(uint64_t* hash_pair); + void moveBitmapFrom(void* data, size_t len); + void exportBitmap(void* data, size_t len) const; + void* getPtr() const; + GenericBitmap& getBitmap(); + +private: + uint8_t hashCount; + GenericBitmap bitmap; +}; + diff --git a/src/cmd_handler.cc b/src/cmd_handler.cc new file mode 100644 index 0000000..ed91329 --- /dev/null +++ b/src/cmd_handler.cc @@ -0,0 +1,174 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "cmd_handler.h" + +#include "db_mgr.h" +#include "db_internal.h" +#include "log_mgr.h" +#include "skiplist.h" +#include "table_mgr.h" + +#include +#include + +namespace jungle { + +CmdHandler::CmdHandler( const std::string& _w_name, + const GlobalConfig& _config ) +{ + workerName = _w_name; + gConfig = _config; + CmdHandlerOptions options; + options.sleepDuration_ms = 1000; + options.worker = this; + curOptions = options; + handle = std::thread(WorkerBase::loop, &curOptions); +} + +CmdHandler::~CmdHandler() { +} + +void CmdHandler::work(WorkerOptions* opt_base) { + Status s; + + DBMgr* dbm = DBMgr::getWithoutInit(); + if (!dbm) return; + + dbm->updateGlobalTime(); + // For the case when there is no traffic. 
+ dbm->updateOpHistory(0); + bool new_idle_status = dbm->determineIdleStatus(); + if (dbm->setIdleStatus(new_idle_status)) { + if (new_idle_status) { + _log_info(dbm->getLogger(), " === Enter idle traffic mode ==="); + } else { + _log_info(dbm->getLogger(), " === Enter normal traffic mode ==="); + } + } + + DBWrap* target_dbw = nullptr; + + { std::lock_guard l(dbm->dbMapLock); + + std::vector dbs_to_check; + skiplist_node* cursor = skiplist_begin(&dbm->dbMap); + while (cursor) { + DBWrap* dbwrap = _get_entry(cursor, DBWrap, snode); + dbs_to_check.push_back(dbwrap); + cursor = skiplist_next(&dbm->dbMap, cursor); + skiplist_release_node(&dbwrap->snode); + } + if (cursor) skiplist_release_node(cursor); + + for (DBWrap* dbw: dbs_to_check) { + if (FileMgr::exist(dbw->path + "/jungle_cmd")) { + target_dbw = dbw; + break; + } + } + } + + if (target_dbw) { + handleCmd(target_dbw); + FileMgr::remove(target_dbw->path + "/jungle_cmd"); + } +} + +void CmdHandler::handleCmd(DBWrap* target_dbw) { + std::string cmd_file = target_dbw->path + "/jungle_cmd"; + std::string ret_file = target_dbw->path + "/jungle_cmd_result"; + + std::ifstream fs; + fs.open(cmd_file); + if (!fs.good()) return; + + std::stringstream ss; + ss << fs.rdbuf(); + fs.close(); + + if (ss.str().empty()) return; + + std::string ret_str; + std::vector tokens = + StrHelper::tokenize( StrHelper::trim(ss.str()), " " ); + + if ( tokens[0] == "getstats" ) { + ret_str = hGetStats(target_dbw, tokens); + + } else if ( tokens[0] == "loglevel" ) { + ret_str = hLogLevel(target_dbw, tokens); + + } + + std::ofstream ofs; + ofs.open(ret_file); + if (ret_str.empty()) { + ofs << "failed" << std::endl; + } else { + ofs << ret_str; + } + ofs.close(); + return; +} + +std::string CmdHandler::hGetStats(DBWrap* target_dbw, + const std::vector& tokens) +{ + DBStats stats_out; + target_dbw->db->getStats(stats_out); + + std::stringstream ss; + ss << "num_records" + << " " << stats_out.numKvs << std::endl; + ss << "working_set_size" + << 
" " << stats_out.workingSetSizeByte << std::endl; + ss << "cache" + << " " << stats_out.cacheUsedByte + << " " << stats_out.cacheSizeByte << std::endl; + + return ss.str(); +} + +std::string CmdHandler::hLogLevel(DBWrap* target_dbw, + const std::vector& tokens) +{ + std::stringstream ss; + int prev_lv = target_dbw->db->getLogLevel(); + + if (tokens.size() == 1) { + // Get log level. + ss << "log_level" + << " " << prev_lv << std::endl; + + } else { + // Set log level. + int new_lv = atoi(tokens[1].c_str()); + if (new_lv < -1 || new_lv > 6) { + ss << "invalid level: " << tokens[1] << std::endl; + return ss.str(); + } + + target_dbw->db->setLogLevel(new_lv); + + ss << "log_level" + << " " << prev_lv + << " " << new_lv << std::endl; + } + return ss.str(); +} + +} // namespace jungle diff --git a/src/cmd_handler.h b/src/cmd_handler.h new file mode 100644 index 0000000..230415e --- /dev/null +++ b/src/cmd_handler.h @@ -0,0 +1,50 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "worker_mgr.h" + +#include + +namespace jungle { + +struct DBWrap; +class CmdHandler : public WorkerBase { +public: + struct CmdHandlerOptions : public WorkerOptions { + }; + + CmdHandler(const std::string& _w_name, + const GlobalConfig& _config); + ~CmdHandler(); + void work(WorkerOptions* opt_base); + + GlobalConfig gConfig; + +private: + void handleCmd(DBWrap* target_dbw); + + std::string hGetStats(DBWrap* target_dbw, + const std::vector& tokens); + + std::string hLogLevel(DBWrap* target_dbw, + const std::vector& tokens); + +}; + + +} // namespace jungle diff --git a/src/compactor.cc b/src/compactor.cc new file mode 100644 index 0000000..4f089e6 --- /dev/null +++ b/src/compactor.cc @@ -0,0 +1,224 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "compactor.h" + +#include "db_mgr.h" +#include "db_internal.h" +#include "log_mgr.h" +#include "skiplist.h" +#include "table_mgr.h" + +#include +#include + +#include + +namespace jungle { + +Compactor::Compactor( const std::string& _w_name, + const GlobalConfig& _config ) + : lastCheckedFileIndex(0xffff) // Any big number to start from 0. 
+ , lastCompactedHashNum(0) +{ + workerName = _w_name; + gConfig = _config; + CompactorOptions options; + options.sleepDuration_ms = gConfig.compactorSleepDuration_ms; + options.worker = this; + curOptions = options; + handle = std::thread(WorkerBase::loop, &curOptions); +} + +Compactor::~Compactor() { +} + +bool Compactor::chkLevel(size_t level, + DBWrap* dbwrap, + DB*& target_db_out, + size_t& target_level_out, + size_t& target_hash_num_out, + TableInfo*& target_table_out, + TableMgr::MergeStrategy& target_strategy_out) +{ + Status s; + bool found = false; + const DBConfig& db_config = dbwrap->db->p->dbConfig; + size_t num_l0 = dbwrap->db->p->tableMgr->getNumL0Partitions(); + + if (level == 0) { + // L0: Special case (hash). + for (size_t ii=0; iidb->p->tableMgr->chkL0CompactCond(idx)) { + target_db_out = dbwrap->db; + target_level_out = 0; + target_hash_num_out = idx; + lastCompactedHashNum = idx; + found = true; + break; + } + } + if (found) return true; + + } else { + // L1+: Key range. + if (db_config.nextLevelExtension) { + // Check L1 conditions. + TableMgr::MergeStrategy ms; + TableInfo* victim = nullptr; + s = dbwrap->db->p->tableMgr->chkLPCompactCond(level, ms, victim); + if (s) { + target_db_out = dbwrap->db; + target_level_out = level; + target_table_out = victim; + target_strategy_out = ms; + found = true; + } + } + if (found) return true; + } + + return false; +} + +void Compactor::work(WorkerOptions* opt_base) { + Status s; + + DBMgr* dbm = DBMgr::getWithoutInit(); + if (!dbm) return; + + DB* target_db = nullptr; + size_t target_level = 0; + + // only for L0. + size_t target_hash_num = 0; + + // only for L1. + TableInfo* target_table = nullptr; + TableMgr::MergeStrategy target_strategy = TableMgr::INPLACE; + + { std::lock_guard l(dbm->dbMapLock); + + // NOTE: + // Start from right next DB of the last checked one. + // Checking outside skiplist's loop will be safe + // as long as we are holding `dbMapLock`. 
+ std::vector dbs_to_check; + + skiplist_node* cursor = skiplist_begin(&dbm->dbMap); + while (cursor) { + DBWrap* dbwrap = _get_entry(cursor, DBWrap, snode); + dbs_to_check.push_back(dbwrap); + cursor = skiplist_next(&dbm->dbMap, cursor); + skiplist_release_node(&dbwrap->snode); + } + if (cursor) skiplist_release_node(cursor); + + size_t num_dbs = dbs_to_check.size(); + if (++lastCheckedFileIndex >= num_dbs) lastCheckedFileIndex = 0; + + size_t s_idx = lastCheckedFileIndex; + size_t e_idx = lastCheckedFileIndex + num_dbs; + for (size_t ii = s_idx; ii < e_idx; ++ii) { + lastCheckedFileIndex = ii % num_dbs; + DBWrap* dbwrap = dbs_to_check[lastCheckedFileIndex]; + const DBConfig& db_config = dbwrap->db->p->dbConfig; + + if ( db_config.nextLevelExtension && + dbwrap->db->p->tableMgr->isL0CompactionInProg() ) { + // In level extension mode, there is nothing we can do + // if L0 -> L1 compaction is in progress, + // as it will touch (possibly) all tables in L1. + + // TODO: + // We can support parallel L0 -> L1 compaction + // as we can write to the same table simultaneously. + continue; + } + + // Currently all levels have even probability. + // TODO: Give higher priority to upper levels? + size_t num_levels = dbwrap->db->p->tableMgr->getNumLevels(); + std::vector prob_dist(num_levels, 1); + size_t start_level = RndGen::fromProbDist(prob_dist); + + if ( dbm->isIdleTraffic() || + ( dbm->isDebugParamsEffective() && + dbm->getDebugParams().urgentCompactionRatio ) ) { + // Special compaction mode, L0 compaction first. + start_level = 0; + } + + bool found = false; + for (size_t ii = start_level; ii < start_level + num_levels; ++ii) { + size_t lv = ii % num_levels; + found = chkLevel( lv, dbwrap, target_db, target_level, + target_hash_num, target_table, target_strategy ); + if (found) break; + } + if (found) { + target_db->p->incBgTask(); + break; + } + } + } + + if (target_db) { + // Found a DB to compact. 
+ CompactOptions c_opt; + + if (target_level == 0) { + s = target_db->p->tableMgr->compactL0(c_opt, target_hash_num); + + } else { + if (target_strategy == TableMgr::INTERLEVEL) { + s = target_db->p->tableMgr->compactLevelItr + ( c_opt, target_table, target_level ); + + } else if ( target_strategy == TableMgr::SPLIT && + target_table ) { + s = target_db->p->tableMgr->splitLevel + ( c_opt, target_table, target_level ); + + } else if ( target_strategy == TableMgr::INPLACE && + target_table ) { + s = target_db->p->tableMgr->compactInPlace + ( c_opt, target_table, target_level ); + + } else if ( target_strategy == TableMgr::MERGE && + target_table ) { + s = target_db->p->tableMgr->mergeLevel + ( c_opt, target_table, target_level ); + + } else { + assert(0); + } + if (target_table) target_table->done(); + } + + // Do not sleep next time to continue + // to quickly compact other DB. + if (s) { + doNotSleepNextTime = true; + } + + target_db->p->decBgTask(); + } +} + +} // namespace jungle + diff --git a/src/compactor.h b/src/compactor.h new file mode 100644 index 0000000..5d2ee8c --- /dev/null +++ b/src/compactor.h @@ -0,0 +1,57 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "table_mgr.h" +#include "worker_mgr.h" + +#include + +#include + +namespace jungle { + +class DB; +struct DBWrap; +struct TableInfo; +class Compactor : public WorkerBase { +public: + struct CompactorOptions : public WorkerOptions { + }; + + Compactor(const std::string& _w_name, + const GlobalConfig& _config); + ~Compactor(); + void work(WorkerOptions* opt_base); + + GlobalConfig gConfig; + +private: + bool chkLevel(size_t level, + DBWrap* dbwrap, + DB*& target_db_out, + size_t& target_level_out, + size_t& target_hash_num_out, + TableInfo*& target_table_out, + TableMgr::MergeStrategy& target_strategy_out); + + size_t lastCheckedFileIndex; + size_t lastCompactedHashNum; +}; + + +} // namespace jungle diff --git a/src/configs.h b/src/configs.h new file mode 100644 index 0000000..4c32351 --- /dev/null +++ b/src/configs.h @@ -0,0 +1,21 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#define VERBOSE_LOG_SUPPRESS_MS (5000) +#define THROTTLING_EFFECTIVE_TIME_MS (5000) + diff --git a/src/crc32.cc b/src/crc32.cc new file mode 100644 index 0000000..ed651a1 --- /dev/null +++ b/src/crc32.cc @@ -0,0 +1,386 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2011-2016 Stephan Brumme +See URL: https://github.com/stbrumme/crc32 +See Original ZLib License: https://github.com/stbrumme/crc32/blob/master/LICENSE + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +// LCOV_EXCL_START + +#include "crc32.h" + +#include +#include + +/// look-up table, already declared above +const uint32_t crc_lookup[8][256] = +{ +//// same algorithm as crc32_bitwise +//for (int i = 0; i <= 0xFF; i++) +//{ +// uint32_t crc = i; +// for (int j = 0; j < 8; j++) +// crc = (crc >> 1) ^ ((crc & 1) * Polynomial); +// Crc32Lookup[0][i] = crc; +//} +//// ... 
and the following slicing-by-8 algorithm (from Intel): +//// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf +//// http://sourceforge.net/projects/slicing-by-8/ +//for (int i = 0; i <= 0xFF; i++) +//{ +// Crc32Lookup[1][i] = (Crc32Lookup[0][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[0][i] & 0xFF]; +// Crc32Lookup[2][i] = (Crc32Lookup[1][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[1][i] & 0xFF]; +// Crc32Lookup[3][i] = (Crc32Lookup[2][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[2][i] & 0xFF]; + +// Crc32Lookup[4][i] = (Crc32Lookup[3][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[3][i] & 0xFF]; +// Crc32Lookup[5][i] = (Crc32Lookup[4][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[4][i] & 0xFF]; +// Crc32Lookup[6][i] = (Crc32Lookup[5][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[5][i] & 0xFF]; +// Crc32Lookup[7][i] = (Crc32Lookup[6][i] >> 8) ^ Crc32Lookup[0][Crc32Lookup[6][i] & 0xFF]; +//} + { 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, + 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, + 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, + 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, + 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, + 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, + 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, + 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, + 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, + 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, + 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, + 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, + 
0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, + 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, + 0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, + 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, + 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, + 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, + 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, + 0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, + 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, + 0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, + 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, + 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, + 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, + 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, + 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, + 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, + 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, + 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, + 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, + 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D }, + + { 0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7, + 
0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF, + 0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496, + 0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E, + 0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265, + 0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D, + 0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034, + 0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C, + 0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2, + 0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA, + 0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93, + 0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B, + 0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60, + 0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768, + 0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31, + 0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539, + 0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C, + 0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484, + 0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD, + 0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5, + 0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E, + 0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026, + 0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F, 
+ 0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277, + 0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189, + 0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81, + 0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8, + 0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0, + 0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B, + 0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23, + 0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A, + 0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72 }, + + { 0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685, + 0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D, + 0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5, + 0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D, + 0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065, + 0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD, + 0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315, + 0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD, + 0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45, + 0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD, + 0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835, + 0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D, + 
0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5, + 0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D, + 0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5, + 0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D, + 0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05, + 0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD, + 0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75, + 0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD, + 0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5, + 0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D, + 0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895, + 0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D, + 0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5, + 0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D, + 0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5, + 0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D, + 0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625, + 0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D, + 0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555, + 0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED }, + + { 0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9, + 
0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056, + 0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26, + 0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9, + 0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787, + 0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68, + 0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018, + 0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7, + 0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084, + 0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B, + 0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B, + 0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4, + 0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA, + 0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755, + 0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825, + 0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA, + 0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82, + 0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D, + 0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D, + 0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2, + 0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC, + 0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953, + 0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623, 
+ 0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC, + 0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF, + 0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50, + 0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120, + 0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF, + 0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981, + 0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E, + 0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E, + 0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1 }, + + { 0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10, + 0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1, + 0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92, + 0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053, + 0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314, + 0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5, + 0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496, + 0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57, + 0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459, + 0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98, + 0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB, + 0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A, + 
0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D, + 0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C, + 0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF, + 0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E, + 0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82, + 0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743, + 0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00, + 0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1, + 0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386, + 0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847, + 0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404, + 0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5, + 0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB, + 0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A, + 0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349, + 0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888, + 0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF, + 0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E, + 0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D, + 0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C }, + + { 0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8, + 
0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5, + 0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223, + 0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E, + 0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E, + 0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3, + 0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715, + 0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578, + 0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4, + 0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9, + 0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F, + 0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22, + 0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2, + 0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F, + 0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79, + 0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14, + 0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460, + 0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D, + 0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB, + 0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496, + 0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156, + 0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B, + 0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD, 
+ 0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0, + 0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C, + 0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61, + 0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97, + 0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA, + 0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A, + 0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957, + 0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1, + 0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC }, + + { 0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E, + 0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9, + 0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240, + 0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27, + 0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712, + 0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975, + 0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC, + 0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB, + 0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7, + 0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590, + 0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739, + 0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E, + 
0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B, + 0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C, + 0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5, + 0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2, + 0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C, + 0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B, + 0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2, + 0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5, + 0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0, + 0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387, + 0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E, + 0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49, + 0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105, + 0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62, + 0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB, + 0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC, + 0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899, + 0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE, + 0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457, + 0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30 }, + + { 0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919, + 
0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC, + 0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832, + 0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387, + 0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F, + 0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA, + 0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64, + 0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1, + 0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4, + 0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041, + 0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF, + 0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A, + 0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2, + 0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217, + 0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889, + 0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C, + 0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3, + 0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776, + 0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8, + 0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D, + 0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95, + 0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520, + 0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE, 
+ 0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B, + 0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E, + 0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B, + 0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05, + 0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0, + 0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78, + 0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD, + 0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53, + 0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6 } +}; + +#define MIN(a,b) (((a)<(b))?(a):(b)) + +uint32_t crc32_1(const void* data, size_t len, uint32_t prev_value) { + uint32_t crc = ~prev_value; + const uint8_t* cur = (const uint8_t*) data; + + while (len-- > 0) { + crc = (crc >> 8) ^ crc_lookup[0][(crc & 0xFF) ^ *cur++]; + } + + return ~crc; +} + +uint32_t crc32_8(const void* data, size_t len, uint32_t prev_value) { + uint32_t* cur = (uint32_t*)data; + uint32_t crc = ~prev_value; + + while (len >= 8) { +#if 0 + // Disabled big-endian platform. 
+ // #ifdef _BIG_ENDIAN + uint32_t one = *cur++ ^ bitswap32(crc); + uint32_t two = *cur++; + crc = + crc_lookup[7][(one>>24) & 0xFF] ^ + crc_lookup[6][(one>>16) & 0xFF] ^ + crc_lookup[5][(one>> 8) & 0xFF] ^ + crc_lookup[4][(one ) & 0xFF] ^ + crc_lookup[3][(two>>24) & 0xFF] ^ + crc_lookup[2][(two>>16) & 0xFF] ^ + crc_lookup[1][(two>> 8) & 0xFF] ^ + crc_lookup[0][(two ) & 0xFF]; +#else + uint32_t one = *cur++ ^ crc; + uint32_t two = *cur++; + crc = + crc_lookup[7][(one ) & 0xFF] ^ + crc_lookup[6][(one>> 8) & 0xFF] ^ + crc_lookup[5][(one>>16) & 0xFF] ^ + crc_lookup[4][(one>>24) & 0xFF] ^ + crc_lookup[3][(two ) & 0xFF] ^ + crc_lookup[2][(two>> 8) & 0xFF] ^ + crc_lookup[1][(two>>16) & 0xFF] ^ + crc_lookup[0][(two>>24) & 0xFF]; +#endif + len -= 8; + } + + unsigned char *cur_byte = (unsigned char*) cur; + while (len--) { + crc = (crc >> 8) ^ crc_lookup[0][(crc & 0xFF) ^ *cur_byte++]; + } + + return ~crc; +} + +uint32_t crc32_8_last8(const void* data, size_t len, uint32_t prev_value) { + size_t min = MIN(len, 8); + void *src = (char*)data + (len-min); +#ifdef _ALIGN_MEM_ACCESS + uint64_t temp; // aligned + memcpy(&temp, src, min); + src = &temp; +#endif + return crc32_8(src, min, prev_value); +} + +// LCOV_EXCL_STOP + diff --git a/src/crc32.h b/src/crc32.h new file mode 100644 index 0000000..2ba4380 --- /dev/null +++ b/src/crc32.h @@ -0,0 +1,39 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2011-2016 Stephan Brumme +See URL: https://github.com/stbrumme/crc32 +See Original ZLib License: https://github.com/stbrumme/crc32/blob/master/LICENSE + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#ifndef _JSAHN_CRC32_H +#define _JSAHN_CRC32_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +uint32_t crc32_1(const void* data, size_t len, uint32_t prev_value); +uint32_t crc32_8(const void* data, size_t len, uint32_t prev_value); +uint32_t crc32_8_last8(const void* data, size_t len, uint32_t prev_value); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/db_config.cc b/src/db_config.cc new file mode 100644 index 0000000..518a8a3 --- /dev/null +++ b/src/db_config.cc @@ -0,0 +1,72 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "db_mgr.h" + +#include + +namespace jungle { + +bool DBConfig::isValid() const { + return true; +} + +uint64_t DBConfig::getMaxTableSize(size_t level) const { + if (level == 0) return maxL0TableSize; + + uint64_t ret = maxL1TableSize; + size_t num_ratio_elems = tableSizeRatio.size(); + double last_ratio = num_ratio_elems + ? *tableSizeRatio.rbegin() + : 10; + for (size_t ii = 1; ii < level; ++ii) { + size_t vector_idx = ii - 1; + if (num_ratio_elems > vector_idx) { + ret *= tableSizeRatio[vector_idx]; + } else { + ret *= last_ratio; + } + } + return ret; +} + +size_t DBConfig::getMaxParallelWriters() const { + // If given, just return it. + if (maxParallelWritesPerJob) return maxParallelWritesPerJob; + if (readOnly) return 1; + + // If zero, calculate it. + DBMgr* mgr = DBMgr::getWithoutInit(); + if (!mgr) return 1; + + GlobalConfig* g_conf = mgr->getGlobalConfig(); + if (!g_conf) return 1; + + size_t num_task_threads = g_conf->numFlusherThreads + + g_conf->numCompactorThreads; + if (!num_task_threads) return 1; + + // Round-up. + size_t ret = ( g_conf->numTableWriters + (num_task_threads * 2) - 1 ) / + num_task_threads; + if (!ret) return 1; + if (ret == 1 && g_conf->numTableWriters) return 2; + return ret; +} + + +}; // namespace jungle; + diff --git a/src/db_group.cc b/src/db_group.cc new file mode 100644 index 0000000..b1e2b78 --- /dev/null +++ b/src/db_group.cc @@ -0,0 +1,166 @@ +#include "db_internal.h" +#include "db_mgr.h" +#include "fileops_directio.h" +#include "fileops_posix.h" +#include "internal_helper.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include + +namespace jungle { + +DBGroup::DBGroup() : p(new DBGroupInternal()) {} +DBGroup::~DBGroup() { + delete p; +} + +// Static function. 
+Status DBGroup::open(DBGroup** ptr_out, + std::string path, + const DBConfig& db_config) +{ + Status s; + DBMgr* mgr = DBMgr::get(); + std::string empty_kvs_name; + DB* default_db = mgr->openExisting(path, empty_kvs_name); + if (default_db) { + if (default_db->p->dbGroup == nullptr) { + // User already directly opened DB using DB::open(). + // Cannot use DBGroup at the same time. Error. + mgr->close(default_db); + return Status::INVALID_HANDLE_USAGE; + } else { + // Return existing one. + *ptr_out = default_db->p->dbGroup; + return Status(); + } + } + + // Otherwise: create a new one. + s = DB::open(&default_db, path, db_config); + if (!s) return s; + + DBGroup* db_group; + db_group = new DBGroup(); + + DBGroup::DBGroupInternal* p = db_group->p; + p->path = path; + p->config = db_config; + p->defaultDB = default_db; + + p->defaultDB->p->dbGroup = db_group; + *ptr_out = db_group; + return Status(); +} + +// Static function. +Status DBGroup::close(DBGroup* db_group) { + Status s; + + if (db_group->p->defaultDB) { + DBMgr* mgr = DBMgr::getWithoutInit(); + if (!mgr) return Status::ALREADY_SHUTDOWN; + + s = mgr->close(db_group->p->defaultDB); + } + return Status(); +} + +// Static function. 
+Status DBGroup::openDefaultDB(DB** ptr_out) +{ + if (!p->defaultDB) return Status::NOT_INITIALIZED; + + *ptr_out = p->defaultDB; + return Status(); +} + +Status DBGroup::openDB(DB** ptr_out, + std::string db_name) +{ + return openDB(ptr_out, db_name, p->config); +} + +Status DBGroup::openDB(DB** ptr_out, + std::string db_name, + const DBConfig& db_config) +{ + if (!ptr_out || db_name.empty() || !db_config.isValid()) { + return Status::INVALID_PARAMETERS; + } + + DBMgr* db_mgr = DBMgr::get(); + DB* src = p->defaultDB; + DB* db = db_mgr->openExisting(src->p->path, db_name); + if (db) { + *ptr_out = db; + return Status(); + } + + Status s; + + db = new DB(); + db->p->path = src->p->path; + db->p->fOps = new FileOpsPosix; + db->p->fDirectOps = new FileOpsDirectIO(src->p->myLog); + db->p->dbConfig = db_config; + db->p->myLog = src->p->myLog; + + // Shared objects: mani. + db->p->mani = src->p->mani; + + // Own objects: logMgr, tableMgr. + s = db->p->mani->getKVSID(db_name, db->p->kvsID); + if (!s) { + // Create a new KVS + s = db->p->mani->addNewKVS(db_name); + if (!s) return s; + + s = db->p->mani->getKVSID(db_name, db->p->kvsID); + if (!s) return s; + + db->p->mani->store(); + db->p->mani->sync(); + } + + db->p->kvsName = db_name; + + LogMgrOptions log_mgr_opt; + log_mgr_opt.fOps = db->p->fOps; + log_mgr_opt.fDirectOps = db->p->fDirectOps; + log_mgr_opt.path = db->p->path; + log_mgr_opt.prefixNum = db->p->kvsID; + log_mgr_opt.kvsName = db_name; + log_mgr_opt.dbConfig = &db->p->dbConfig; + db->p->logMgr = new LogMgr(db); + db->p->logMgr->setLogger(db->p->myLog); + db->p->logMgr->init(log_mgr_opt); + + TableMgrOptions table_mgr_opt; + table_mgr_opt.fOps = db->p->fOps; + table_mgr_opt.path = db->p->path; + table_mgr_opt.prefixNum = db->p->kvsID; + table_mgr_opt.dbConfig = &db->p->dbConfig; + db->p->tableMgr = new TableMgr(db); + db->p->tableMgr->setLogger(db->p->myLog); + db->p->tableMgr->init(table_mgr_opt); + + // In case of previous crash, + // sync table's last 
seqnum if log is lagging behind. + db->p->logMgr->syncSeqnum(db->p->tableMgr); + + s = db_mgr->assignNew(db); + if (!s) { + // Other thread already creates the handle. + db->p->destroy(); + delete db; + db = db_mgr->openExisting(src->p->path, db_name); + } + *ptr_out = db; + + return Status(); +} + +} + diff --git a/src/db_internal.h b/src/db_internal.h new file mode 100644 index 0000000..e4c3dd2 --- /dev/null +++ b/src/db_internal.h @@ -0,0 +1,289 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "avltree.h" +#include "db_manifest.h" +#include "db_mgr.h" +#include "event_awaiter.h" +#include "fileops_base.h" +#include "internal_helper.h" +#include "log_mgr.h" +#include "table_mgr.h" + +#include + +#include +#include +#include +#include +#include +#include + +using simple_thread_pool::TaskHandle; + +class SimpleLogger; + +namespace jungle { + +class DB::DBInternal { +public: + static const size_t MAX_CLOSE_RETRY = 100; // 10 seconds. + + struct ThrottlingStats { + ThrottlingStats() + : lastLogFlushRate(0) + , lastTableFlushRate(0) + , lastSplitRate(0) + { + // 10 seconds. + lastSplitRateExpiry.setDurationMs(10 * 1000); + lastTableFlushRateExpiry.setDurationMs(10 * 1000); + } + // Unit: IOPS. + std::atomic lastLogFlushRate; + + // Table compaction will not happen frequently. 
Need expiry. + Timer lastTableFlushRateExpiry; + std::atomic lastTableFlushRate; + + // Split will not happen frequently. Need expiry. + Timer lastSplitRateExpiry; + std::atomic lastSplitRate; + }; + + struct Flags { + Flags() + : onGoingBgTasks(0) + , closing(false) + , rollbackInProgress(false) + , seqLoading(false) + {} + std::atomic onGoingBgTasks; + std::atomic closing; + std::atomic rollbackInProgress; + std::atomic seqLoading; + }; + + DBInternal(); + + ~DBInternal(); + + void destroy(); + + // Correct contradict attributes. + void adjustConfig(); + + // Wait for on-going background tasks to be finished (or canceled). + void waitForBgTasks(); + + void incBgTask() { + flags.onGoingBgTasks.fetch_add(1); + } + + void decBgTask() { + assert(flags.onGoingBgTasks.load()); + flags.onGoingBgTasks.fetch_sub(1); + } + + void updateOpHistory(); + + enum OpType { + OPTYPE_READ = 0x1, + OPTYPE_WRITE = 0x2, + OPTYPE_FLUSH = 0x3, + OPTYPE_COMPACT = 0x4, + }; + Status checkHandleValidity(OpType op_type = OpType::OPTYPE_READ); + + // DB directory path (e.g., /foo/bar), + // not including "/" at the end. + std::string path; + + // Current DB config. + DBConfig dbConfig; + + // Backward pointer to parent DBGroup. + // If NULL, it was directly opened using DB::open() without DBGroup. + DBGroup* dbGroup; + + // Backward pointer to wrapper of this DB instance. + DBWrap* wrapper; + + // File ops handle, depends on the platform. + FileOps* fOps; + + // File ops handle based on direct-io + FileOps* fDirectOps; + + // DB manifest. + DBManifest* mani; + + // Sub key-value store ID. + uint64_t kvsID; + + // Sub key-value store name. + std::string kvsName; + + // Log manager. + LogMgr* logMgr; + + // Table manager. + TableMgr* tableMgr; + + // Throttling statistics. + ThrottlingStats tStats; + + // Logger. + SimpleLogger* myLog; + + // Verbose logging control for async flush. + VerboseLog vlAsyncFlush; + + // Flags. 
+ Flags flags; + + // Last async flush job, if delayed task is scheduled. + std::shared_ptr asyncFlushJob; + std::mutex asyncFlushJobLock; +}; + +class DB::SnapInternal { +public: + SnapInternal(uint64_t _last_flush, uint64_t _chk_num) + : lastFlush(_last_flush) + , chkNum(_chk_num) + , logList(nullptr) + , tableList(nullptr) + {} + uint64_t lastFlush; + uint64_t chkNum; + std::list* logList; + std::list* tableList; +}; + +class DBGroup::DBGroupInternal { +public: + DBGroupInternal() + : defaultDB(nullptr) + {} + + std::string path; + DBConfig config; + + DB* defaultDB; +}; + +class Iterator::IteratorInternal { +public: + IteratorInternal(Iterator* _parent) + : db(nullptr) + , windowCursor(nullptr) + , parent(_parent) + { + avl_init(&curWindow, nullptr); + } + + enum Type { + BY_KEY = 0, + BY_SEQ = 1 + }; + struct ItrItem { + ItrItem() : flags(0x0), logItr(nullptr), tableItr(nullptr) {} + enum Flag { + none = 0x0, + no_more_prev = 0x1, + no_more_next = 0x2, + }; + inline static int cmpSeq(avl_node *a, avl_node *b, void *aux) { + ItrItem* aa = _get_entry(a, ItrItem, an); + ItrItem* bb = _get_entry(b, ItrItem, an); + if (aa->lastRec.seqNum < bb->lastRec.seqNum) return -1; + else if (aa->lastRec.seqNum > bb->lastRec.seqNum) return 1; + + // Seqnumber cannot be the same in the iterator. + // Pick log first and then table. + if (aa->logItr && !bb->logItr) return -1; + else if (!aa->logItr && bb->logItr) return 1; + + // Cannot happen. Bug. + assert(0); + return 0; + } + inline static int cmpKey(avl_node *a, avl_node *b, void *aux) { + ItrItem* aa = _get_entry(a, ItrItem, an); + ItrItem* bb = _get_entry(b, ItrItem, an); + + CMP_NULL_CHK(aa->lastRec.kv.key.data, bb->lastRec.kv.key.data); + + int cmp = 0; + if (aux) { + // Custom cmp mode. 
+ DB* db = reinterpret_cast(aux); + CustomCmpFunc func = db->p->dbConfig.cmpFunc; + void* param = db->p->dbConfig.cmpFuncParam; + cmp = func(aa->lastRec.kv.key.data, aa->lastRec.kv.key.size, + bb->lastRec.kv.key.data, bb->lastRec.kv.key.size, + param); + } else { + cmp = SizedBuf::cmp(aa->lastRec.kv.key, bb->lastRec.kv.key); + } + + // NOTE: + // key: ascending, seq: descending order. + // e.g.) + // K1 (seq 5), K1 (seq 2), K2 (seq 3), K2 (seq 1) .. + if (cmp == 0) return cmpSeq(b, a, aux); + return cmp; + } + avl_node an; + uint8_t flags; + LogMgr::Iterator* logItr; + TableMgr::Iterator* tableItr; + + // WARNING: Jungle's iterator doesn't own the memory of `lastRec`. + // It is managed by either log iterator or table iterator. + Record lastRec; + }; + + Status moveToLastValid(); + + Status seekInternal(const SizedBuf& key, + const uint64_t seqnum, + SeekOption opt, + bool goto_end = false); + + inline int cmpSizedBuf(const SizedBuf& l, const SizedBuf& r); + + bool checkValidBySeq(ItrItem* item, + const uint64_t cur_seq, + const bool is_prev = false); + bool checkValidByKey(ItrItem* item, + const SizedBuf& cur_key, + const bool is_prev = false); + + Type type; + DB* db; + //LogMgr::Iterator logItr; + std::vector itrs; + avl_tree curWindow; + avl_node* windowCursor; + Iterator* parent; +}; + +} // namespace jungle + diff --git a/src/db_manifest.cc b/src/db_manifest.cc new file mode 100644 index 0000000..bab4bc9 --- /dev/null +++ b/src/db_manifest.cc @@ -0,0 +1,239 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "db_manifest.h" + +#include "crc32.h" +#include "internal_helper.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include + +namespace jungle { + +static uint8_t DBMANI_FOOTER[8] = {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0xab, 0xab}; +static uint32_t DBMANI_VERSION = 0x1; + +DBManifest::DBManifest(FileOps* _f_ops) + : fOps(_f_ops) + , mFile(nullptr) + , maxKVSID(1) // 0 is reserved for default. + , myLog(nullptr) +{} + +DBManifest::~DBManifest() { + if (mFile) { + delete mFile; + } +} + +Status DBManifest::create(const std::string& path, + const std::string& filename) +{ + if (!fOps) + return Status::NOT_INITIALIZED; + if (fOps->exist(filename.c_str())) + return Status::ALREADY_EXIST; + if (filename.empty()) { + return Status::INVALID_PARAMETERS; + } + + dirPath = path; + mFileName = filename; + + // Create a new file. + Status s; + _log_info(myLog, "Create new DB manifest %s", mFileName.c_str()); + fOps->open(&mFile, mFileName.c_str()); + + store(); + + return Status(); +} + +Status DBManifest::load(const std::string& path, + const std::string& filename) +{ + if (!fOps) return Status::NOT_INITIALIZED; + if (mFile) return Status::ALREADY_LOADED; + if (filename.empty()) return Status::INVALID_PARAMETERS; + + dirPath = path; + mFileName = filename; + + Status s; + + _log_debug(myLog, "Load DB manifest %s", mFileName.c_str()); + EP( fOps->open(&mFile, mFileName.c_str()) ); + + // File should be bigger than 12 bytes (footer + version + CRC32). 
+ size_t file_size = fOps->eof(mFile); + if (file_size < 16) return Status::FILE_CORRUPTION; + + SizedBuf mani_buf(file_size); + SizedBuf::Holder h_mani_buf(mani_buf); + EP( fOps->pread(mFile, mani_buf.data, mani_buf.size, 0) ); + + RwSerializer ss(mani_buf); + + // Magic check + uint8_t footer_file[8]; + ss.pos(file_size - 16); + ss.get(footer_file, 8); + if (memcmp(DBMANI_FOOTER, footer_file, 8) != 0) return Status::FILE_CORRUPTION; + + // Version check + uint32_t ver_file = ss.getU32(s); + (void)ver_file; + + // CRC check + uint32_t crc_file = ss.getU32(s); + + SizedBuf chk_buf(file_size - 4); + SizedBuf::Holder h_chk_buf(chk_buf); + ss.pos(0); + ss.get(chk_buf.data, chk_buf.size); + uint32_t crc_local = crc32_8(chk_buf.data, chk_buf.size, 0); + if (crc_local != crc_file) return Status::CHECKSUM_ERROR; + + ss.pos(0); + maxKVSID.store(ss.getU64(s), MOR); + uint32_t num_kvs = ss.getU32(s); + + _log_debug(myLog, "Max KVS ID %ld, # KVS %d", maxKVSID.load(), num_kvs); + + char kvs_name_buf[256]; + for (uint32_t ii=0; ii> + // Latest KVS ID, 8 bytes + // Number of KVSes, 4 bytes + ss.putU64(maxKVSID.load(MOR)); + ss.putU32(IDToName.size()); + _log_debug(myLog, "Max KVS ID %ld, # KVS %d", maxKVSID.load(), IDToName.size()); + + for (auto& entry: IDToName) { + // << KVS entry format >> + // KVS ID, 8 bytes + // Length of name, 4 bytes + // Name, xx bytes + ss.putU64(entry.first); + std::string kvs_name = entry.second; + ss.putU32(kvs_name.size()); + ss.put(kvs_name.c_str(), kvs_name.size()); + _log_debug(myLog, "ID %d, name %s", entry.first, kvs_name.c_str()); + } + + // Footer. + ss.put(DBMANI_FOOTER, 8); + + // Version. + ss.putU32(DBMANI_VERSION); + + // CRC32. + uint32_t crc_val = crc32_8(mani_buf.data, ss.pos(), 0); + + ss.putU32(crc_val); + + EP( fOps->pwrite(mFile, mani_buf.data, ss.pos(), 0) ); + + // Should truncate tail. + fOps->ftruncate(mFile, ss.pos()); + + // After success, make a backup file one more time, + // using the latest data. 
+ EP(BackupRestore::backup(fOps, mFileName)); + + return Status(); +} + +Status DBManifest::sync() +{ + return fOps->fsync(mFile); +} + +Status DBManifest::addNewKVS(const std::string& kvs_name) +{ + auto entry = NameToID.find(kvs_name); + if (entry != NameToID.end()) { + _log_debug(myLog, "Add new KVS %s failed, already exists.", + kvs_name.c_str()); + return Status::ALREADY_EXIST; + } + + uint64_t new_id = maxKVSID.fetch_add(1, MOR); + NameToID.insert( std::make_pair(kvs_name, new_id) ); + IDToName.insert( std::make_pair(new_id, kvs_name) ); + _log_debug(myLog, "Added new KVS %ld %s.", new_id, kvs_name.c_str()); + + return Status(); +} + +Status DBManifest::getKVSID(const std::string& kvs_name, + uint64_t& kvs_id_out) +{ + auto entry = NameToID.find(kvs_name); + if (entry == NameToID.end()) + return Status::KVS_NOT_FOUND; + + kvs_id_out = entry->second; + + return Status(); +} + +Status DBManifest::getKVSName(const uint64_t kvs_id, + std::string& kvs_name_out) +{ + auto entry = IDToName.find(kvs_id); + if (entry == IDToName.end()) + return Status::KVS_NOT_FOUND; + + kvs_name_out = entry->second; + + return Status(); +} + + +} // namespace jungle + diff --git a/src/db_manifest.h b/src/db_manifest.h new file mode 100644 index 0000000..485a7cb --- /dev/null +++ b/src/db_manifest.h @@ -0,0 +1,62 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "fileops_base.h" +#include "skiplist.h" +#include "table_file.h" + +#include + +#include +#include + +class SimpleLogger; + +namespace jungle { + +class DBManifest { +public: + DBManifest(FileOps* _f_ops); + ~DBManifest(); + + Status create(const std::string& path, + const std::string& filename); + Status load(const std::string& path, + const std::string& filename); + Status store(); + Status sync(); + Status addNewKVS(const std::string& kvs_name); + Status getKVSID(const std::string& kvs_name, + uint64_t& kvs_id_out); + Status getKVSName(const uint64_t kvs_id, + std::string& kvs_name_out); + + void setLogger(SimpleLogger* logger) { myLog = logger; } + +private: + FileOps* fOps; + FileHandle* mFile; + std::string dirPath; + std::string mFileName; + std::atomic maxKVSID; + std::map NameToID; + std::map IDToName; + SimpleLogger* myLog; +}; + +} // namespace jungle diff --git a/src/db_mgr.cc b/src/db_mgr.cc new file mode 100644 index 0000000..51e165c --- /dev/null +++ b/src/db_mgr.cc @@ -0,0 +1,452 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "db_mgr.h" + +#include "cmd_handler.h" +#include "compactor.h" +#include "db_internal.h" +#include "flusher.h" +#include "internal_helper.h" +#include "log_reclaimer.h" + +#include _MACRO_TO_STR(LOGGER_H) + +namespace jungle { + +std::atomic DBMgr::instance(nullptr); +std::mutex DBMgr::instanceLock; + +DBMgr* DBMgr::init(const GlobalConfig& config) { + DBMgr* mgr = instance.load(MOR); + if (!mgr) { + std::lock_guard l(instanceLock); + mgr = instance.load(MOR); + if (!mgr) { + mgr = new DBMgr(); + instance.store(mgr, MOR); + + mgr->initInternal(config); + } + } + return mgr; +} + +void DBMgr::printGlobalConfig() { + if (gConfig.itcOpt.timeWindow_sec) { + _log_info(myLog, "idle time compaction checking window %zu sec, " + "lower than %zu iops, adjusted factor %zu", + gConfig.itcOpt.timeWindow_sec, + gConfig.itcOpt.iopsThreshold, + gConfig.itcOpt.factor); + if (gConfig.itcOpt.startHour == gConfig.itcOpt.endHour) { + _log_info(myLog, "idle time compaction will happen regardless of time"); + } else { + int tz_gap = SimpleLoggerMgr::getTzGap(); + int tz_gap_abs = (tz_gap < 0) ? 
(tz_gap * -1) : (tz_gap); + _log_info(myLog, "idle time compaction time: %02zu to %02zu, %c%02d:%02d", + gConfig.itcOpt.startHour, gConfig.itcOpt.endHour, + (tz_gap >= 0)?'+':'-', tz_gap_abs / 60, tz_gap_abs % 60); + } + } else { + _log_info(myLog, "idle time compaction disabled"); + } +} + +void DBMgr::initInternal(const GlobalConfig& config) { + updateGlobalTime(); + + gConfig = config; + + std::string log_file = gConfig.globalLogPath + "/jungle_global.log"; + myLog = new SimpleLogger(log_file, 1024, 32*1024*1024, 4); + myLog->setLogLevel(4); + myLog->setDispLevel(-1); + myLog->start(); + + printGlobalConfig(); + + for (size_t ii=0; iiaddWorker(flusher); + flusher->run(); + } + _log_info(myLog, "%zu flusher threads, sleep time %zu ms", + config.numFlusherThreads, + config.flusherSleepDuration_ms); + + for (size_t ii=0; iiaddWorker(compactor); + compactor->run(); + } + _log_info(myLog, "%zu compactor threads, sleep time %zu ms", + config.numCompactorThreads, + config.compactorSleepDuration_ms); + _log_info(myLog, "auto sync by flusher: %s, " + "min number of records to flush: %zu", + config.flusherAutoSync ? "ON" : "OFF", + config.flusherMinRecordsToTrigger); + + { + CmdHandler* cmd_handler = new CmdHandler("cmd_handler", config); + wMgr->addWorker(cmd_handler); + cmd_handler->run(); + _log_info(myLog, "initiated cmd handler thread"); + } + + simple_thread_pool::ThreadPoolOptions tp_opt; + tp_opt.numInitialThreads = 0; // main thread only. 
+ tpMgr.init(tp_opt); + _log_info(myLog, "initiated async timer"); + + twMgr->init(); +} + +DBMgr* DBMgr::get() { + DBMgr* mgr = instance.load(MOR); + if (!mgr) { + return init(); + } + return mgr; +} + +DBMgr* DBMgr::getWithoutInit() { + DBMgr* mgr = instance.load(MOR); + return mgr; +} + +void DBMgr::destroy() { + std::lock_guard l(instanceLock); + DBMgr* mgr = instance.load(MOR); + if (mgr) { + for (size_t ii = 0; ii < MAX_OP_HISTORY; ++ii) { + _log_trace(mgr->myLog, "[%zu] %zd", + ii, mgr->opHistory[ii]->load()); + } + + delete mgr; + instance.store(nullptr, MOR); + } +} + + +DBMgr::DBMgr() + : wMgr(new WorkerMgr()) + , fQueue(new FlusherQueue()) + , twMgr(new TableWriterMgr()) + , idleTraffic(false) + , myLog(nullptr) +{ + skiplist_init(&dbMap, DBWrap::cmp); + + for (size_t ii = 0; ii < MAX_OP_HISTORY; ++ii) { + opHistory.push_back( new std::atomic(-1) ); + } +} + +DBMgr::~DBMgr() { + { // Remove all pending files. + std::lock_guard l(filesToRemoveLock); + for (auto& entry: filesToRemove) { + if (!FileMgr::exist(entry)) continue; + Timer tt; + FileMgr::remove(entry); + _log_info(myLog, "removed pending file %s. %zu us", + entry.c_str(), tt.getUs()); + } + filesToRemove.clear(); + } + + // Should close all workers before closing DBMgr. + if (wMgr) delete wMgr; + if (fQueue) delete fQueue; + if (twMgr) delete twMgr; + + skiplist_node* cursor = skiplist_begin(&dbMap); + while (cursor) { + DBWrap* db_wrap = _get_entry(cursor, DBWrap, snode); + cursor = skiplist_next(&dbMap, cursor); + delete db_wrap; + } + skiplist_free(&dbMap); + + tpMgr.shutdown(); + + for (auto& entry: opHistory) delete entry; + + DELETE(myLog); +} + +Status DBMgr::addLogReclaimer() { + std::string t_name = "reclaimer_0"; + // Check if it already exists. 
+ WorkerBase* existing = wMgr->getWorker(t_name); + if (existing) return Status::ALREADY_EXIST; + + LogReclaimer* reclaimer = new LogReclaimer(t_name, gConfig); + wMgr->addWorker(reclaimer); + reclaimer->run(); + + _log_info( myLog, "added log file reclaimer thread, " + "sleep time %zu sec", + gConfig.logFileReclaimerSleep_sec ); + + return Status(); +} + +DB* DBMgr::openExisting(const std::string& path, const std::string& kvs_name) { + std::lock_guard l(dbMapLock); + + DBWrap query; + query.path = path; + query.kvsName = kvs_name; + + skiplist_node* cursor = skiplist_find(&dbMap, &query.snode); + if (!cursor) { + return nullptr; + } + + DBWrap* db_wrap = _get_entry(cursor, DBWrap, snode); + db_wrap->refCount.fetch_add(1, MOR); + skiplist_release_node(&db_wrap->snode); + return db_wrap->db; +} + +Status DBMgr::assignNew(DB* db) { + DBWrap* db_wrap = nullptr; + + std::lock_guard l(dbMapLock); + + DBWrap query; + query.path = db->p->path; + query.kvsName = db->p->kvsName; + + skiplist_node* cursor = skiplist_find(&dbMap, &query.snode); + if (cursor) { + skiplist_release_node(cursor); + return Status::ALREADY_EXIST; + } + + db_wrap = new DBWrap(); + db_wrap->db = db; + db_wrap->path = db->p->path; + db_wrap->kvsName = db->p->kvsName; + db_wrap->refCount.fetch_add(1, MOR); + skiplist_insert(&dbMap, &db_wrap->snode); + + db->p->wrapper = db_wrap; + + return Status(); +} + +Status DBMgr::close(DB* db) { + DBWrap* db_wrap = db->p->wrapper; + if (!db_wrap) + return Status(); + + if (db_wrap->refCount.load(MOR) == 0) + return Status::ALREADY_CLOSED; + + uint64_t expected = 1; + uint64_t val = 0; + if (db_wrap->refCount.compare_exchange_weak(expected, val)) { + // Destroy DB handle. + _log_debug(db->p->myLog, "Destroy DB %p", db); + std::lock_guard l(dbMapLock); + skiplist_erase_node(&dbMap, &db_wrap->snode); + skiplist_wait_for_free(&db_wrap->snode); + if (db->p->dbGroup) { + // This is a default handle of group handle. + // Delete group handle as well. 
+ delete db->p->dbGroup; + } + db->p->destroy(); + delete db_wrap; + delete db; + } else { + uint64_t ref_c = db_wrap->refCount.fetch_sub(1, MOR); + _log_debug(db->p->myLog, "Decrease DB %p ref_count = %ld", db, ref_c); + } + + return Status(); +} + +Status DBMgr::closeAll(const std::string& path) { + std::lock_guard l(dbMapLock); + + // Close & free all handles for the given path. + DBWrap query; + query.path = path; + + skiplist_node* cursor = skiplist_find(&dbMap, &query.snode); + while (cursor) { + DBWrap* db_wrap = _get_entry(cursor, DBWrap, snode); + cursor = skiplist_next(&dbMap, cursor); + + if (db_wrap->path == path) { + skiplist_erase_node(&dbMap, &db_wrap->snode); + skiplist_release_node(&db_wrap->snode); + skiplist_wait_for_free(&db_wrap->snode); + db_wrap->db->p->destroy(); + delete db_wrap->db; + delete db_wrap; + } else { + skiplist_release_node(&db_wrap->snode); + break; + } + skiplist_release_node(&db_wrap->snode); + } + if (cursor) skiplist_release_node(cursor); + + return Status(); +} + +Status DBMgr::addFileToRemove(const std::string& full_path) { + if (!FileMgr::exist(full_path)) return Status::FILE_NOT_EXIST; + + std::lock_guard l(filesToRemoveLock); + filesToRemove.push_back(full_path); + + return Status(); +} + +Status DBMgr::popFileToRemove(std::string& full_path) { + std::lock_guard l(filesToRemoveLock); + auto entry = filesToRemove.begin(); + if (entry == filesToRemove.end()) return Status::FILE_NOT_EXIST; + + full_path = *entry; + filesToRemove.pop_front(); + return Status(); +} + +Status DBMgr::forceRemoveFiles() { + // Remove all files in queue (blocking). 
+ std::lock_guard l(filesToRemoveLock); + for (const std::string& full_path: filesToRemove) { + Timer tt; + FileMgr::remove(full_path); + _log_info( getLogger(), + "force removed pending file %s, %zu us", + full_path.c_str(), tt.getUs() ); + } + filesToRemove.clear(); + return Status(); +} + +void DBMgr::setDebugParams(const DebugParams& to, + size_t effective_time_sec) +{ + std::lock_guard l(debugParamsLock); + debugParams = to; + debugParamsTimer.setDurationMs(effective_time_sec * 1000); + debugParamsTimer.reset(); + _log_warn(myLog, "new debugging parameters (effective %zu seconds): " + "compaction delay %zu %zu, " + "urgent compaction size %zu, " + "urgent compaction ratio %zu, " + "rollback delay %zu", + effective_time_sec, + debugParams.compactionDelayUs, + debugParams.compactionItrScanDelayUs, + debugParams.urgentCompactionFilesize, + debugParams.urgentCompactionRatio, + debugParams.rollbackDelayUs); +} + +void DBMgr::updateGlobalTime() { + struct timeval tv; + gettimeofday(&tv, NULL); + globalTime.store(tv.tv_sec, MOR); + _log_trace(myLog, "updated global time: %zu", tv.tv_sec); +} + +uint64_t DBMgr::getGlobalTime() const { + return globalTime.load(MOR); +} + +void DBMgr::updateOpHistory(size_t amount) { + uint64_t g_time = getGlobalTime(); + uint64_t idx = (g_time / 10) % MAX_OP_HISTORY; + int64_t prev = opHistory[idx]->load(MOR); + if ( prev == -1 ) { + int64_t exp = -1; + int64_t des = (int64_t)amount; + if (opHistory[idx]->compare_exchange_weak(exp, des, MOR) ) { + uint64_t next_idx = (idx + 1) % MAX_OP_HISTORY; + opHistory[next_idx]->store(-1); + return; + } + } + opHistory[idx]->fetch_add(amount, MOR); +} + +bool DBMgr::determineIdleStatus() { + if (!gConfig.itcOpt.timeWindow_sec) return false; + + if (gConfig.itcOpt.startHour < gConfig.itcOpt.endHour) { + // e.g.) 01 - 04: from 01:00:00 to 03:59:59. 
+ SimpleLoggerMgr::TimeInfo lt = std::chrono::system_clock::now(); + if ( lt.hour < (int)gConfig.itcOpt.startHour || + lt.hour >= (int)gConfig.itcOpt.endHour ) { + return false; + } + } else { + // e.g.) 23 - 01: from 23:00:00 to 23:59:59 and + // 00:00:00 to 00:59:59. + SimpleLoggerMgr::TimeInfo lt = std::chrono::system_clock::now(); + if ( lt.hour > (int)gConfig.itcOpt.endHour && + lt.hour < (int)gConfig.itcOpt.startHour ) return false; + } + + uint64_t g_time = getGlobalTime(); + uint64_t s_idx = (g_time / 10) % MAX_OP_HISTORY; + size_t count = 0; + for ( int ii = MAX_OP_HISTORY + s_idx; ii > (int)s_idx; --ii ) { + int idx = ii % MAX_OP_HISTORY; + int64_t entry = opHistory[idx]->load(MOR); + if (entry < 0) continue; + if (entry / 10 > gConfig.itcOpt.iopsThreshold) return false; + + count++; + if (count >= 3 && count >= gConfig.itcOpt.timeWindow_sec / 10) break; + } + + if (count >= 3) { + // We need at least two slots (including the current one) + // to make sure no false alarm. + return true; + } + return false; +} + +bool DBMgr::setIdleStatus(bool to) { + bool prev = idleTraffic.load(MOR); + if (prev != to) { + return idleTraffic.compare_exchange_weak(prev, to, MOR); + } + return false; +} + +bool DBMgr::isIdleTraffic() const { + return idleTraffic.load(MOR); +} + +} // namespace jungle + diff --git a/src/db_mgr.h b/src/db_mgr.h new file mode 100644 index 0000000..f46db77 --- /dev/null +++ b/src/db_mgr.h @@ -0,0 +1,191 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "internal_helper.h" +#include "skiplist.h" +#include "table_writer.h" +#include "worker_mgr.h" + +#include + +#include "simple_thread_pool.h" + +#include +#include +#include +#include +#include + +class SimpleLogger; + +namespace jungle { + +struct DBWrap { + DBWrap() : db(nullptr), refCount(0) { + skiplist_init_node(&snode); + } + ~DBWrap() { + skiplist_free_node(&snode); + } + + static int cmp(skiplist_node *a, skiplist_node *b, void *aux) { + DBWrap *aa, *bb; + aa = _get_entry(a, DBWrap, snode); + bb = _get_entry(b, DBWrap, snode); + + if (aa->path < bb->path) return -1; + else if (aa->path > bb->path) return 1; + else { + if (aa->kvsName < bb->kvsName) return -1; + else if (aa->kvsName > bb->kvsName) return 1; + else return 0; + } + } + + DB* db; + std::atomic refCount; + std::string path; + std::string kvsName; + skiplist_node snode; +}; + +static GlobalConfig _default_global_config_; + +class FlusherQueue; + +// Singleton class +class DBMgr { + friend class Flusher; + friend class CmdHandler; + friend class Compactor; + friend class LogReclaimer; + friend class Merger; + +public: + static const size_t MAX_OP_HISTORY = 360; // 1 hour. 
+ + static DBMgr* init(const GlobalConfig& config = _default_global_config_); + static DBMgr* get(); + static DBMgr* getWithoutInit(); + static void destroy(); + + Status addLogReclaimer(); + + DB* openExisting(const std::string& path, const std::string& kvs_name); + Status assignNew(DB* db); + + Status close(DB* db); + Status closeAll(const std::string& path); + + Status addFileToRemove(const std::string& full_path); + Status popFileToRemove(std::string& full_path); + Status forceRemoveFiles(); + + void updateGlobalTime(); + + uint64_t getGlobalTime() const; + + void updateOpHistory(size_t amount = 1); + + bool determineIdleStatus(); + + bool setIdleStatus(bool to); + + bool isIdleTraffic() const; + + WorkerMgr* workerMgr() const { return wMgr; } + + FlusherQueue* flusherQueue() const { return fQueue; } + + TableWriterMgr* tableWriterMgr() const { return twMgr; } + + GlobalConfig* getGlobalConfig() { return &gConfig; } + + SimpleLogger* getLogger() const { return myLog; } + + simple_thread_pool::ThreadPoolMgr* getTpMgr() { return &tpMgr; } + + void setDebugParams(const DebugParams& to, + size_t effective_time_sec = 3600); + + DebugParams getDebugParams() { + std::lock_guard l(debugParamsLock); + return debugParams; + } + + bool isDebugParamsEffective() const { return !debugParamsTimer.timeout(); } + +private: + DBMgr(); + + ~DBMgr(); + + void printGlobalConfig(); + + void initInternal(const GlobalConfig& config); + + // Singleton instance and lock. + static std::atomic instance; + static std::mutex instanceLock; + + // Global config. + GlobalConfig gConfig; + + // Map of pairs. + skiplist_raw dbMap; + std::mutex dbMapLock; + + // Worker manager. + WorkerMgr* wMgr; + + // Async flush request queue. + FlusherQueue* fQueue; + + // Pending files to be removed. + std::list filesToRemove; + std::mutex filesToRemoveLock; + + // Table writers pool manager. + TableWriterMgr* twMgr; + + // Debugging parameters. 
+ DebugParams debugParams; + std::mutex debugParamsLock; + Timer debugParamsTimer; + + // For async timer purpose. + simple_thread_pool::ThreadPoolMgr tpMgr; + + // Global time (epoch in seconds). + std::atomic globalTime; + + // Circular history of the number of operations (an entry: 10 seconds). + // -1: not initialized, should skip. + std::vector< std::atomic* > opHistory; + + // `true` if the current traffic to this process is idle. + std::atomic idleTraffic; + + // Logger. + SimpleLogger* myLog; +}; + +} // namespace jungle + + + diff --git a/src/endian_encode.h b/src/endian_encode.h new file mode 100644 index 0000000..66fdd16 --- /dev/null +++ b/src/endian_encode.h @@ -0,0 +1,118 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2016 Couchbase, Inc. +See URL: https://github.com/couchbase/forestdb + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
/* (Apache-2.0 license header continued)
**************************************************************************/

#pragma once

// Byte-order detection and 16/32/64-bit byte-swap macros. `_enc*`/`_dec*`
// convert host byte order to/from big-endian on-disk order.
#if defined(WIN32) || defined(_WIN32)
    #ifndef _LITTLE_ENDIAN
    #define _LITTLE_ENDIAN
    #endif

#elif __linux__
    // NOTE(review): include target stripped in the patch dump; restored.
    #include <endian.h>
    #if __BYTE_ORDER == __LITTLE_ENDIAN
        #ifndef _LITTLE_ENDIAN
        #define _LITTLE_ENDIAN
        #endif
    #elif __BYTE_ORDER == __BIG_ENDIAN
        #ifndef _BIG_ENDIAN
        #define _BIG_ENDIAN
        #endif
    #else
        #error "not supported platform"
    #endif

#elif __APPLE__
    // NOTE(review): include target stripped in the patch dump; restored.
    #include <machine/endian.h>
    #if BYTE_ORDER == LITTLE_ENDIAN
        #ifndef _LITTLE_ENDIAN
        #define _LITTLE_ENDIAN
        #endif
    #elif BYTE_ORDER == BIG_ENDIAN
        #ifndef _BIG_ENDIAN
        #define _BIG_ENDIAN
        #endif
    #else
        #error "not supported endian"
    #endif

#else
    #error "not supported platform"

#endif

#ifndef reverse_order_64
#define reverse_order_64(v)                 \
    ( (((v) & 0xff00000000000000ULL) >> 56) \
    | (((v) & 0x00ff000000000000ULL) >> 40) \
    | (((v) & 0x0000ff0000000000ULL) >> 24) \
    | (((v) & 0x000000ff00000000ULL) >>  8) \
    | (((v) & 0x00000000ff000000ULL) <<  8) \
    | (((v) & 0x0000000000ff0000ULL) << 24) \
    | (((v) & 0x000000000000ff00ULL) << 40) \
    | (((v) & 0x00000000000000ffULL) << 56) )
#endif

#ifndef reverse_order_32
#define reverse_order_32(v)    \
    ( (((v) & 0xff000000) >> 24) \
    | (((v) & 0x00ff0000) >>  8) \
    | (((v) & 0x0000ff00) <<  8) \
    | (((v) & 0x000000ff) << 24) )
#endif

#ifndef reverse_order_16
#define reverse_order_16(v)    \
    ( (((v) & 0xff00) >> 8)  \
    | (((v) & 0x00ff) << 8) )
#endif

#if defined(_LITTLE_ENDIAN)
    // convert to big endian
    #define _enc64(v) reverse_order_64(v)
    #define _dec64(v) reverse_order_64(v)
    #define _enc32(v) reverse_order_32(v)
    #define _dec32(v) reverse_order_32(v)
    #define _enc16(v) reverse_order_16(v)
    #define _dec16(v) reverse_order_16(v)
#else
    // big endian .. do nothing
    #define _enc64(v) (v)
    #define _dec64(v) (v)
    #define _enc32(v) (v)
    #define _dec32(v) (v)
    #define _enc16(v) (v)
    #define _dec16(v) (v)
#endif

// Size-dispatched encode/decode: picks the right swap by sizeof(v).
#define __ENDIAN_SAFE
#ifdef __ENDIAN_SAFE
#define _enc(v) \
    ((sizeof(v) == 8)?(_enc64(v)):( \
     (sizeof(v) == 4)?(_enc32(v)):( \
     (sizeof(v) == 2)?(_enc16(v)):(v))))
#define _dec(v) \
    ((sizeof(v) == 8)?(_dec64(v)):( \
     (sizeof(v) == 4)?(_dec32(v)):( \
     (sizeof(v) == 2)?(_dec16(v)):(v))))
#else
#define _enc(v) (v)
#define _dec(v) (v)
#endif

/* --- patch residue: header of the next file in the original diff ---
diff --git a/src/event_awaiter.h b/src/event_awaiter.h
new file mode 100644
index 0000000..c11726d
--- /dev/null
+++ b/src/event_awaiter.h
@@ -0,0 +1,97 @@
Apache-2.0 license header of src/event_awaiter.h:
Modifications Copyright 2017-2019 eBay Inc.
Original Copyright:
See URL: https://github.com/greensky00/event_awaiter (v0.1.1)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--- end residue --- */
/* (Apache-2.0 license header continued)
**************************************************************************/

#pragma once

// NOTE(review): include targets stripped in the patch dump; restored to the
// headers this class uses.
#include <atomic>
#include <condition_variable>
#include <mutex>

// One-shot event: threads block in wait*() until another thread calls
// invoke(). If invoke() happens first, wait*() returns immediately.
// reset() re-arms the event for the next round.
class EventAwaiter {
private:
    // Lifecycle states of one wait/invoke round.
    enum class AS {
        idle    = 0x0,
        ready   = 0x1,
        waiting = 0x2,
        done    = 0x3
    };

public:
    EventAwaiter() : status(AS::idle) {}

    // Re-arm the event for reuse.
    void reset() {
        status.store(AS::idle);
    }

    // Block until invoke() is called (no timeout).
    void wait() {
        wait_us(0);
    }

    void wait_ms(size_t time_ms) {
        wait_us(time_ms * 1000);
    }

    // Block for up to `time_us` microseconds (0 means wait forever).
    // Returns immediately if invoke() already happened.
    void wait_us(size_t time_us) {
        AS expected = AS::idle;
        if (status.compare_exchange_strong(expected, AS::ready)) {
            // invoke() has not been invoked yet, wait for it.
            std::unique_lock<std::mutex> l(cvLock);
            expected = AS::ready;
            if (status.compare_exchange_strong(expected, AS::waiting)) {
                if (time_us) {
                    cv.wait_for(l, std::chrono::microseconds(time_us));
                } else {
                    cv.wait(l);
                }
                status.store(AS::done);
            } else {
                // invoke() has grabbed `cvLock` earlier than this.
            }
        } else {
            // invoke() already has been called earlier than this.
        }
    }

    // Wake up the waiter, or mark the event done if nobody waits yet.
    void invoke() {
        AS expected = AS::idle;
        if (status.compare_exchange_strong(expected, AS::done)) {
            // wait() has not been invoked yet, do nothing.
            return;
        }

        std::unique_lock<std::mutex> l(cvLock);
        expected = AS::ready;
        if (status.compare_exchange_strong(expected, AS::done)) {
            // wait() has been called earlier than invoke(),
            // but invoke() has grabbed `cvLock` earlier than wait().
            // Do nothing.
        } else {
            // wait() is waiting for ack.
            cv.notify_all();
        }
    }

private:
    std::atomic<AS> status;
    std::mutex cvLock;
    std::condition_variable cv;
};

/* --- patch residue: header of the next file in the original diff ---
diff --git a/src/fileops_base.h b/src/fileops_base.h
new file mode 100644
index 0000000..65f4f12
--- /dev/null
+++ b/src/fileops_base.h
@@ -0,0 +1,134 @@
Apache-2.0 license header of src/fileops_base.h:
Copyright 2017-2019 eBay Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
--- end residue --- */
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include + +#include + +#include +#include +#include + +namespace jungle { + +using cs_off_t = int64_t; + +class FileOps; +class FileHandle { +public: + FileHandle() : fOps(nullptr) { } + virtual ~FileHandle() { } + + FileOps* ops() const { return fOps; } + + virtual bool isOpened() { return false; } + +protected: + FileOps* fOps; +}; + +class FileOps { +public: + virtual ~FileOps() { } + + enum Mode { + NORMAL = 0x0, + READ_ONLY = 0x1, + }; + + static int getFlags(FileOps::Mode mode) { + int flags = 0; + if (mode == FileOps::NORMAL) { + flags = O_CREAT | O_RDWR; + } else { + flags = O_RDONLY; + } + + return flags; + } + + static bool supportDirectIO() { + // Can't define both __APPLE__ and __linux__ +#if defined(__APPLE__) && defined(__linux__) + return false; +#endif + +#ifdef __APPLE__ + #ifdef F_NOCACHE + return true; + #else + return false; + #endif +#elif defined(__linux__) + #ifndef _GNU_SOURCE + #define _GNU_SOURCE + #endif + + #ifdef O_DIRECT + return true; + #else + return false; + #endif +#else + return false; +#endif + } + + Status open(FileHandle** fhandle_out, + const std::string& pathname) { + return open(fhandle_out, pathname, NORMAL); + } + + virtual Status open(FileHandle** fhandle_out, + const std::string& pathname, + FileOps::Mode mode) = 0; + + virtual Status close(FileHandle* fhandle) = 0; + + virtual Status pread(FileHandle* fhandle, + void* buf, + size_t count, + cs_off_t offset) = 0; + + virtual Status pwrite(FileHandle* fhandle, + const void *buf, + 
size_t count, + cs_off_t offset) = 0; + + virtual Status append(FileHandle* fhandle, + const void* buf, + size_t count) = 0; + + virtual cs_off_t eof(FileHandle* fhandle) = 0; + + virtual Status flush(FileHandle* fhandle) = 0; + + virtual Status fsync(FileHandle* fhandle) = 0; + + virtual Status ftruncate(FileHandle* fhandle, + cs_off_t length) = 0; + + virtual Status mkdir(const std::string& path) = 0; + + virtual bool exist(const std::string& path) = 0; + + virtual Status remove(const std::string& path) = 0; +}; + +} // namespace jungle + diff --git a/src/fileops_directio.cc b/src/fileops_directio.cc new file mode 100644 index 0000000..51fa841 --- /dev/null +++ b/src/fileops_directio.cc @@ -0,0 +1,613 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "fileops_directio.h" + +#include +#include +#include +#include +#include + +namespace jungle { + +#define MAX_TRIES (100) +#define MAX_SLEEP_US (500000) // 500 ms + +class FileHandleDirectIO : public FileHandle { +public: + FileHandleDirectIO(std::string _pathname, + int _fd, + FileOps* _ops, + void* _alignedBuf) + : pathname(std::move(_pathname)) + , fd(_fd) + , filePos(0) + , flushedFilePos(0) + , fsyncedFilePos(0) + , bufPtr(_alignedBuf) + , bufPos(0) + { + fOps = _ops; + bufCharPtr = static_cast(bufPtr); + } + + ~FileHandleDirectIO() override { + if (isOpened()) { + ::close(fd); + filePos = 0; + flushedFilePos = 0; + fsyncedFilePos = 0; + } + if (bufPtr) { + free(bufPtr); + bufPtr = nullptr; + bufCharPtr = nullptr; + bufPos = 0; + } + } + + bool isOpened() override { return (fd > 0); } + + std::string pathname; + int fd; + size_t filePos; + size_t flushedFilePos; + size_t fsyncedFilePos; + void* bufPtr; + uint8_t* bufCharPtr; + size_t bufPos; +}; + +struct AlignedMemoryHolder { + explicit AlignedMemoryHolder(void* _bufPtr) : bufPtr(_bufPtr) {} + ~AlignedMemoryHolder() { + if (bufPtr) { + free(bufPtr); + bufPtr = nullptr; + } + } + void* bufPtr; +}; + +static FileHandleDirectIO* getHandle(FileHandle* fhandle) { + return dynamic_cast(fhandle); +} + +FileOpsDirectIO::FileOpsDirectIO(SimpleLogger* log) : myLog(log) {} +FileOpsDirectIO::~FileOpsDirectIO() {} + +Status FileOpsDirectIO::allocAlignedBuf(void** aligned_buf) { + int mr = posix_memalign(aligned_buf, + ALIGNMENT, + ALIGNED_BUFFER_SIZE + ALIGNMENT); + if (mr) { + _log_err(myLog, + "failed to allocate aligned buffer with error code %d, " + "aligned size %d, buffer size %d", + mr, ALIGNMENT, ALIGNED_BUFFER_SIZE); + return Status::DIRECT_IO_NOT_SUPPORTED; + } + // One ALIGNMENT more for padding + memset(*aligned_buf, 0x0, ALIGNED_BUFFER_SIZE + ALIGNMENT); + return Status::OK; +} + +Status 
FileOpsDirectIO::readInternal(const std::string &pathname, + int fd, + void* buf, + size_t nbyte, + off_t offset) +{ + size_t num_tries = 0; + size_t sleep_time_us = 1; + ssize_t r = -1; + size_t read_size = nbyte; + size_t need_align = nbyte % ALIGNMENT; + if (need_align > 0) { + // This is the case that reading the tail of the file with un-aligned size + // Extend the read size even it may be larger than file size + // The size of bytes returned should be exactly equal to nbyte + read_size += (ALIGNMENT - need_align); + } + + do { + r = ::pread(fd, buf, read_size, offset); + if (r == (ssize_t) nbyte) break; + int n = errno; + num_tries++; + _log_err(myLog, + "failed to read log file %s, offset %ld, " + "bytes %zu, errno %d, msg %s, retrying %d...", + pathname.c_str(), offset, + nbyte, n, strerror(n), num_tries); + Timer::sleepUs(sleep_time_us); + sleep_time_us = std::min(sleep_time_us * 2, (size_t) MAX_SLEEP_US); + } while (r != (ssize_t) nbyte && ++num_tries < MAX_TRIES); + if (r != (ssize_t) nbyte) { + _log_err(myLog, + "failed to read log file %s after %d retries, " + "offset %ld, bytes %zu", + pathname.c_str(), num_tries, offset, nbyte); + return Status::FILE_READ_SIZE_MISMATCH; + } else if (num_tries > 0) { + _log_warn(myLog, + "read log file %s succeed after %d retries, " + "offset %ld, bytes %zu", + pathname.c_str(), num_tries, offset, nbyte); + } + return Status::OK; +} + +Status FileOpsDirectIO::open(FileHandle** fhandle_out, + const std::string &pathname, + FileOps::Mode mode) +{ + if (!supportDirectIO()) return Status::DIRECT_IO_NOT_SUPPORTED; + + int flags = getFlags(mode); +#ifdef __linux__ + flags |= O_DIRECT; + _log_debug(myLog, + "open log file %s with flags O_CREAT|O_RDWR|O_DIRECT", + pathname.c_str()); +#endif + + int r = ::open(pathname.c_str(), flags, 0644); + if (r <= 0) { + int n = errno; + _log_err(myLog, + "failed to open log file %s with errno %d, msg %s", + pathname.c_str(), n, strerror(n)); + return Status::ERROR; + } + +#ifdef 
__APPLE__ + int nr = fcntl(r, F_NOCACHE, 1); + if (-1 == nr) { + int n = errno; + _log_err(myLog, + "failed to set log file %s F_NOCACHE with errno %d, msg %s", + pathname.c_str(), n, strerror(n)); + return Status::DIRECT_IO_NOT_SUPPORTED; + } + _log_debug(myLog, + "open log file %s with flags O_CREAT|O_RDWR|F_NOCACHE", + pathname.c_str()); +#endif + + void* aligned_buf = nullptr; + + Status s; + EP(allocAlignedBuf(&aligned_buf)); + + auto* fhandle = new FileHandleDirectIO(pathname, r, this, aligned_buf); + + // Set the aligned write buffer + cs_off_t file_size_tmp = eof(fhandle); + if (file_size_tmp < 0) return Status::ERROR; + auto file_size = static_cast(file_size_tmp); + + size_t need_align = file_size % ALIGNMENT; + if (need_align == 0) { + fhandle->filePos = file_size; + fhandle->flushedFilePos = file_size; + fhandle->fsyncedFilePos = file_size; + fhandle->bufPos = 0; + } else { + // Read the last un-aligned data + EP(readInternal(pathname, + r, + aligned_buf, + need_align, + file_size - need_align)); + fhandle->filePos = file_size - need_align; + fhandle->flushedFilePos = file_size; + fhandle->fsyncedFilePos = file_size; + fhandle->bufPos = need_align; + } + + _log_debug(myLog, + "log file %s is opened, size %zu, file_pos %zu, buf_pos %zu", + pathname.c_str(), file_size, fhandle->filePos, fhandle->bufPos); + + *fhandle_out = fhandle; + return Status(); +} + +Status FileOpsDirectIO::close(FileHandle* fhandle) { + FileHandleDirectIO* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + + Status s = flush(fhandle); + if (!s) { + _log_err(myLog, + "failed to close log file %s due to flush failure %d", + phandle->pathname.c_str(), s); + return s; + } + + cs_off_t file_size = eof(fhandle); + + int r = ::close(phandle->fd); + if (r) { + int n = errno; + _log_err(myLog, + "failed to close log file %s size %ld with errno %d, msg %s", + phandle->pathname.c_str(), file_size, 
n, strerror(n)); + return Status::ERROR; + } + + phandle->fd = -1; + + _log_debug(myLog, + "log file %s is closed, file_pos %zu, " + "buf_pos %zu, file_size %ld", + phandle->pathname.c_str(), phandle->filePos, + phandle->bufPos, file_size); + return Status(); +} + +Status FileOpsDirectIO::pread(FileHandle* fhandle, + void* buf, + size_t count, + cs_off_t _offset) +{ + FileHandleDirectIO* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + if (count == 0) return Status::OK; + + if (!buf || _offset < 0) return Status::INVALID_PARAMETERS; + auto offset = static_cast(_offset); + + cs_off_t file_size_tmp = eof(fhandle); + if (file_size_tmp < 0) return Status::ERROR; + auto file_size = static_cast(file_size_tmp); + if (offset + count > file_size) return Status::FILE_SIZE_MISMATCH; + + // Aligned buffer to direct read + thread_local void* aligned_buf = nullptr; + thread_local Status aligned_buf_status = allocAlignedBuf(&aligned_buf); + thread_local auto* aligned_buf_char = static_cast(aligned_buf); + if (!aligned_buf_status) return aligned_buf_status; + assert(aligned_buf != nullptr && aligned_buf_char != nullptr); + thread_local AlignedMemoryHolder buf_holder(aligned_buf); + (void) buf_holder; + + Status s; + size_t aligned_file_offset = offset; + size_t need_align = offset % ALIGNMENT; + if (need_align > 0) { + aligned_file_offset -= need_align; + } + + auto* buf_char = static_cast(buf); + while (aligned_file_offset < offset + count) { + size_t aligned_slice; + { + size_t slice = std::min((size_t) (offset + count - aligned_file_offset), + (size_t) ALIGNED_BUFFER_SIZE); + aligned_slice = slice; + need_align = slice % ALIGNMENT; + if (need_align > 0) { + aligned_slice = slice + (ALIGNMENT - need_align); + } + assert(aligned_slice > 0 && aligned_slice <= ALIGNED_BUFFER_SIZE); + // The tail of file is allowed to be un-aligned size + if (aligned_file_offset + aligned_slice > 
file_size) { + aligned_slice = file_size - aligned_file_offset; + } + } + + s = readInternal(phandle->pathname, + phandle->fd, + aligned_buf, + aligned_slice, + aligned_file_offset); + if (!s) break; + + memcpy(buf_char + (aligned_file_offset > offset + ? (aligned_file_offset - offset) + : 0), + aligned_buf_char + (offset > aligned_file_offset + ? (offset - aligned_file_offset) + : 0), + std::min(offset + count, aligned_file_offset + aligned_slice) + - std::max(offset, aligned_file_offset)); + aligned_file_offset += aligned_slice; + } + + return s; +} + +Status FileOpsDirectIO::pwrite(FileHandle* fhandle, + const void* buf, + size_t count, + cs_off_t offset) +{ + return Status::NOT_IMPLEMENTED; +} + +Status FileOpsDirectIO::append(FileHandle* fhandle, + const void* buf, + size_t count) +{ + FileHandleDirectIO* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + if (count == 0) return Status::OK; + + size_t sleep_time_us = 1; + ssize_t r = -1; + + size_t source_pos = 0; + + const auto* buf_char = static_cast(buf); + while (source_pos < count) { + size_t write_buf_size = ALIGNED_BUFFER_SIZE - phandle->bufPos; + size_t left_source_size = count - source_pos; + size_t slice = std::min(left_source_size, write_buf_size); + + assert(slice > 0 && slice <= ALIGNED_BUFFER_SIZE); + memcpy(phandle->bufCharPtr + phandle->bufPos, + buf_char + source_pos, + slice); + + phandle->bufPos += slice; + source_pos += slice; + + // Flush, once write buffer is full + if (ALIGNED_BUFFER_SIZE <= phandle->bufPos) { + size_t num_tries = 0; + do { + r = ::pwrite(phandle->fd, + phandle->bufPtr, + ALIGNED_BUFFER_SIZE, + phandle->filePos); + if (r == (ssize_t) ALIGNED_BUFFER_SIZE) break; + int n = errno; + num_tries++; + _log_err(myLog, + "failed to write log file %s, offset %zu, " + "bytes %d, errno %d, msg %s, retrying %d...", + phandle->pathname.c_str(), phandle->filePos, + ALIGNED_BUFFER_SIZE, n, 
strerror(n), num_tries); + Timer::sleepUs(sleep_time_us); + sleep_time_us = std::min(sleep_time_us * 2, (size_t) MAX_SLEEP_US); + } while (r != (ssize_t) ALIGNED_BUFFER_SIZE && num_tries < MAX_TRIES); + if (r != (ssize_t) ALIGNED_BUFFER_SIZE) { + _log_err(myLog, + "failed to write log file %s after %d retries, " + "offset %zu, bytes %d", + phandle->pathname.c_str(), num_tries, + phandle->filePos, ALIGNED_BUFFER_SIZE); + return Status::FILE_WRITE_SIZE_MISMATCH; + } else if (num_tries > 0) { + _log_warn(myLog, + "Write log file %s succeed after %d retries, " + "offset %zu, bytes %d", + phandle->pathname.c_str(), num_tries, + phandle->filePos, ALIGNED_BUFFER_SIZE); + } + phandle->bufPos = 0; + phandle->filePos += ALIGNED_BUFFER_SIZE; + if (phandle->filePos > phandle->flushedFilePos) { + phandle->flushedFilePos = phandle->filePos; + } + } + } + return Status(); +} + +cs_off_t FileOpsDirectIO::eof(FileHandle* fhandle) { + FileHandleDirectIO* phandle = getHandle(fhandle); + if (!phandle) return -1; + if (phandle->fd <= 0) return -1; + int r = ::lseek(phandle->fd, 0, SEEK_END); + if (r < 0) { + int n = errno; + _log_err(myLog, + "failed to get size of log file %s with errno %d, msg %s", + phandle->pathname.c_str(), n, strerror(n)); + } + return r; +} + +Status FileOpsDirectIO::flush(FileHandle* fhandle) { + FileHandleDirectIO* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + if (0 == phandle->bufPos) return Status(); + // Avoid duplicate flush + if (phandle->flushedFilePos >= phandle->filePos + phandle->bufPos) { + return Status(); + } + + size_t need_align = phandle->bufPos % ALIGNMENT; + size_t aligned_pos = phandle->bufPos; + if (need_align > 0) { + size_t padding = ALIGNMENT - need_align; + if (padding < 8) { + // Enough room for padding flags + padding += ALIGNMENT; + } + aligned_pos = phandle->bufPos + padding; + + // Fill zero + memset(phandle->bufCharPtr + 
phandle->bufPos, 0x0, padding); + // Set flag to indicate padding + uint64_t offset = 0; + append_mem_64(phandle->bufCharPtr + phandle->bufPos, + PADDING_HEADER_FLAG, + 0, + offset); + } + assert(aligned_pos > 0 && aligned_pos <= ALIGNED_BUFFER_SIZE); + + size_t num_tries = 0; + size_t sleep_time_us = 1; + ssize_t r = -1; + do { + r = ::pwrite(phandle->fd, + phandle->bufPtr, + aligned_pos, + phandle->filePos); + if (r == (ssize_t) aligned_pos) break; + int n = errno; + num_tries++; + _log_err(myLog, + "failed to write log file %s, offset %zu, " + "bytes %zu, errno %d, msg %s, retrying %d...", + phandle->pathname.c_str(), phandle->filePos, + aligned_pos, n, strerror(n), num_tries); + Timer::sleepUs(sleep_time_us); + sleep_time_us = std::min(sleep_time_us * 2, (size_t) MAX_SLEEP_US); + } while (r != (ssize_t) aligned_pos && num_tries < MAX_TRIES); + if (r != (ssize_t) aligned_pos) { + _log_err(myLog, + "failed to write log file %s after %d retries, " + "offset %zu, bytes %zu", + phandle->pathname.c_str(), num_tries, + phandle->filePos, aligned_pos); + return Status::FILE_WRITE_SIZE_MISMATCH; + } else if (num_tries > 0) { + _log_warn(myLog, + "write log file %s succeed after %d retries, " + "offset %zu, bytes %zu", + phandle->pathname.c_str(), num_tries, + phandle->filePos, aligned_pos); + } + + phandle->flushedFilePos = phandle->filePos + phandle->bufPos; + // Don't need to change buf&file position + // Wait for next writes to make buffer eventually alignment + return Status(); +} + +Status FileOpsDirectIO::fsync(FileHandle* fhandle) { + FileHandleDirectIO* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + // Avoid duplicate fsync + if (phandle->fsyncedFilePos >= phandle->flushedFilePos) { + return Status(); + } + + int r = ::fsync(phandle->fd); + if (r) { + int n = errno; + _log_err(myLog, + "failed to fsync log file %s with errno %d, msg %s", + 
phandle->pathname.c_str(), n, strerror(n)); + return Status::ERROR; + } + phandle->fsyncedFilePos = phandle->flushedFilePos; + return Status(); +} + +Status FileOpsDirectIO::ftruncate(FileHandle* fhandle, + cs_off_t _length) +{ + FileHandleDirectIO* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + if (_length < 0) return Status::INVALID_PARAMETERS; + auto length = static_cast(_length); + + // Memtable will be synced before truncate, + // so it's safe not to handle pending data in phandle->bufPtr. + cs_off_t file_size_tmp = eof(fhandle); + if (file_size_tmp < 0) return Status::ERROR; + auto file_size = static_cast(file_size_tmp); + if (length >= file_size) return Status::OK; + + size_t aligned_length = length; + size_t need_align = length % ALIGNMENT; + if (need_align > 0) { + aligned_length = length + (ALIGNMENT - need_align); + } + + int r = ::ftruncate(phandle->fd, length); + if (r) { + int n = errno; + _log_err(myLog, + "failed to truncate log file %s with errno %d, msg %s", + phandle->pathname.c_str(), n, strerror(n)); + return Status::ERROR; + } + + // Set the aligned write buffer + if (aligned_length == length) { + phandle->filePos = aligned_length; + phandle->flushedFilePos = aligned_length; + phandle->bufPos = 0; + } else { + // Read the last un-aligned data + Status s; + EP(readInternal(phandle->pathname, + phandle->fd, + phandle->bufPtr, + need_align, + aligned_length - ALIGNMENT)); + phandle->filePos = aligned_length - ALIGNMENT; + phandle->flushedFilePos = length; + phandle->bufPos = need_align; + } + + // Double check file size + file_size_tmp = eof(fhandle); + if (file_size_tmp < 0) return Status::ERROR; + file_size = static_cast(file_size_tmp); + if (file_size != length) { + _log_err(myLog, + "failed to truncate log file %s as " + "file size %zu is not equal to expected %zu", + phandle->pathname.c_str(), file_size, length); + return Status::ERROR; + } + 
+ _log_debug(myLog, + "log file %s is truncated, file_pos %zu, " + "buf_pos %zu, file_size %zu", + phandle->pathname.c_str(), phandle->filePos, + phandle->bufPos, file_size); + + return Status(); +} + +Status FileOpsDirectIO::mkdir(const std::string &path) { + int r = ::mkdir(path.c_str(), 0755); + if (r < 0) return Status::ERROR; + + return Status(); +} + +Status FileOpsDirectIO::remove(const std::string &path) { + int r = ::remove(path.c_str()); + if (r < 0) return Status::ERROR; + + return Status(); +} + +bool FileOpsDirectIO::exist(const std::string &path) { + struct stat st; + int result = stat(path.c_str(), &st); + return (result == 0); +} + +} // namespace jungle diff --git a/src/fileops_directio.h b/src/fileops_directio.h new file mode 100644 index 0000000..d638e01 --- /dev/null +++ b/src/fileops_directio.h @@ -0,0 +1,81 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "fileops_base.h" +#include "internal_helper.h" + +#include _MACRO_TO_STR(LOGGER_H) + +namespace jungle { + +#define ALIGNMENT 512 +// 16KB block size +#define ALIGNED_BUFFER_SIZE 16384 + +class FileOpsDirectIO : public FileOps { +public: + FileOpsDirectIO(SimpleLogger* log); + ~FileOpsDirectIO(); + + Status open(FileHandle** fhandle_out, + const std::string& pathname, + FileOps::Mode mode); + + Status close(FileHandle* fhandle); + + Status pread(FileHandle* fhandle, + void* buf, + size_t count, + cs_off_t offset); + + Status pwrite(FileHandle* fhandle, + const void* buf, + size_t count, + cs_off_t offset); + + Status append(FileHandle* fhandle, + const void* buf, + size_t count); + + cs_off_t eof(FileHandle* fhandle); + + // Flush internal aligned write buffer + Status flush(FileHandle* fhandle); + + Status fsync(FileHandle* fhandle); + + Status ftruncate(FileHandle* fhandle, + cs_off_t length); + + Status mkdir(const std::string& path); + + bool exist(const std::string& path); + + Status remove(const std::string& path); +private: + Status allocAlignedBuf(void** aligned_buf); + Status readInternal(const std::string &pathname, + int fd, + void* buf, + size_t nbyte, + off_t offset); + + SimpleLogger* myLog; +}; + +} // namespace jungle diff --git a/src/fileops_posix.cc b/src/fileops_posix.cc new file mode 100644 index 0000000..e53c481 --- /dev/null +++ b/src/fileops_posix.cc @@ -0,0 +1,221 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "fileops_posix.h" + +#include "internal_helper.h" + +#include +#include +#include + +namespace jungle { + +#define MAX_TRIES (100) +#define MAX_SLEEP_US (500000) // 500 ms + +class FileHandlePosix : public FileHandle { +public: + FileHandlePosix(int _fd, FileOps* _ops) + : fd(_fd) { + fOps = _ops; + } + + ~FileHandlePosix() { + if (isOpened()) { + ::close(fd); + } + } + + bool isOpened() { return (fd > 0); } + + int fd; +}; + + +static FileHandlePosix* getHandle(FileHandle* fhandle) { + return static_cast(fhandle); +} + +FileOpsPosix::FileOpsPosix() { } +FileOpsPosix::~FileOpsPosix() { } + +Status FileOpsPosix::open(FileHandle** fhandle_out, + const std::string& pathname, + FileOps::Mode mode) +{ + int flags = getFlags(mode); + + int r = ::open(pathname.c_str(), flags, 0644); + if (r <= 0) return Status::ERROR; + + FileHandlePosix* fhandle = new FileHandlePosix(r, this); + *fhandle_out = fhandle; + + return Status(); +} + +Status FileOpsPosix::close(FileHandle* fhandle) { + FileHandlePosix* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + + int r = ::close(phandle->fd); + if (r < 0) return Status::ERROR; + + phandle->fd = -1; + return Status(); +} + +Status FileOpsPosix::pread(FileHandle* fhandle, + void *buf, + size_t count, + cs_off_t offset) +{ + FileHandlePosix* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return 
Status::INVALID_FILE_DESCRIPTOR; + if (count == 0) return Status::OK; + + if (!buf) return Status::INVALID_PARAMETERS; + + size_t num_tries = 0; + size_t sleep_time_us = 1; + ssize_t r = -1; + do { + r = ::pread(phandle->fd, buf, count, offset); + if (r == (ssize_t)count) break; + + Timer::sleepUs(sleep_time_us); + sleep_time_us = std::min(sleep_time_us * 2, (size_t)MAX_SLEEP_US); + } while (r != (ssize_t)count && ++num_tries < MAX_TRIES); + if (r != (ssize_t)count) return Status::FILE_READ_SIZE_MISMATCH; + + return Status(); +} + +Status FileOpsPosix::pwrite(FileHandle* fhandle, + const void *buf, + size_t count, + cs_off_t offset) +{ + FileHandlePosix* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + if (count == 0) return Status::OK; + + + size_t num_tries = 0; + size_t sleep_time_us = 1; + ssize_t r = -1; + do { + r = ::pwrite(phandle->fd, buf, count, offset); + if (r == (ssize_t)count) break; + + Timer::sleepUs(sleep_time_us); + sleep_time_us = std::min(sleep_time_us * 2, (size_t)MAX_SLEEP_US); + } while (r != (ssize_t)count && ++num_tries < MAX_TRIES); + if (r != (ssize_t)count) return Status::FILE_WRITE_SIZE_MISMATCH; + + return Status(); +} + +Status FileOpsPosix::append(FileHandle* fhandle, + const void* buf, + size_t count) +{ + FileHandlePosix* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + if (count == 0) return Status::OK; + + size_t num_tries = 0; + size_t sleep_time_us = 1; + ssize_t r = -1; + off_t offset = ::lseek(phandle->fd, 0, SEEK_END); + do { + r = ::pwrite(phandle->fd, buf, count, offset); + if (r == (ssize_t)count) break; + + Timer::sleepUs(sleep_time_us); + sleep_time_us = std::min(sleep_time_us * 2, (size_t)MAX_SLEEP_US); + } while (r != (ssize_t)count && ++num_tries < MAX_TRIES); + if (r != (ssize_t)count) return 
Status::FILE_WRITE_SIZE_MISMATCH; + + return Status(); +} + +cs_off_t FileOpsPosix::eof(FileHandle* fhandle) { + FileHandlePosix* phandle = getHandle(fhandle); + if (!phandle) return -1; + if (phandle->fd <= 0) return -1; + + return ::lseek(phandle->fd, 0, SEEK_END); +} + +Status FileOpsPosix::flush(FileHandle* fhandle) { + FileHandlePosix* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + return Status(); +} + +Status FileOpsPosix::fsync(FileHandle* fhandle) { + FileHandlePosix* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + + int r = ::fsync(phandle->fd); + if (r < 0) return Status::ERROR; + + return Status(); +} + +Status FileOpsPosix::ftruncate(FileHandle* fhandle, + cs_off_t length) +{ + FileHandlePosix* phandle = getHandle(fhandle); + if (!phandle) return Status::NULL_FILEOPS_HANDLE; + if (phandle->fd <= 0) return Status::INVALID_FILE_DESCRIPTOR; + + int r = ::ftruncate(phandle->fd, length); + if (r < 0) return Status::ERROR; + + return Status(); +} + +Status FileOpsPosix::mkdir(const std::string& path) { + int r = ::mkdir(path.c_str(), 0755); + if (r < 0) return Status::ERROR; + + return Status(); +} + +Status FileOpsPosix::remove(const std::string& path) { + int r = ::remove(path.c_str()); + if (r < 0) return Status::ERROR; + + return Status(); +} + +bool FileOpsPosix::exist(const std::string& path) { + struct stat st; + int result = stat(path.c_str(), &st); + return (result == 0); +} + + +} // namespace jungle diff --git a/src/fileops_posix.h b/src/fileops_posix.h new file mode 100644 index 0000000..42cfd73 --- /dev/null +++ b/src/fileops_posix.h @@ -0,0 +1,64 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "fileops_base.h" + +namespace jungle { + +class FileOpsPosix : public FileOps { +public: + FileOpsPosix(); + ~FileOpsPosix(); + + Status open(FileHandle** fhandle_out, + const std::string& pathname, + FileOps::Mode mode); + + Status close(FileHandle* fhandle); + + Status pread(FileHandle* fhandle, + void* buf, + size_t count, + cs_off_t offset); + + Status pwrite(FileHandle* fhandle, + const void* buf, + size_t count, + cs_off_t offset); + + Status append(FileHandle* fhandle, + const void* buf, + size_t count); + + cs_off_t eof(FileHandle* fhandle); + + Status flush(FileHandle* fhandle); + + Status fsync(FileHandle* fhandle); + + Status ftruncate(FileHandle* fhandle, + cs_off_t length); + + Status mkdir(const std::string& path); + + bool exist(const std::string& path); + + Status remove(const std::string& path); +}; + +} // namespace jungle diff --git a/src/flusher.cc b/src/flusher.cc new file mode 100644 index 0000000..681814c --- /dev/null +++ b/src/flusher.cc @@ -0,0 +1,262 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "db_mgr.h" +#include "db_internal.h" +#include "flusher.h" +#include "internal_helper.h" +#include "log_mgr.h" +#include "skiplist.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include + +namespace jungle { + +FlusherQueue::~FlusherQueue() { + std::lock_guard l(queueLock); + for (auto& entry: queue) { + FlusherQueueElem*& elem = entry; + delete elem; + } +} + +void FlusherQueue::push(FlusherQueueElem* elem) { + std::unique_lock l(queueLock); + // Find existing request for the same DB. + for (auto& entry: queue) { + FlusherQueueElem*& elem_entry = entry; + if (elem_entry->targetDb == elem->targetDb) { + // Merge handler list. + for (auto& he: elem->handlers) { + elem_entry->handlers.push_back(he); + } + // Set up-to-date info. + elem_entry->fOptions = elem->fOptions; + elem_entry->seqUpto = elem->seqUpto; + _log_debug(elem_entry->targetDb->p->myLog, + "Overwrote existing req %p by %p.", + elem_entry, elem); + + // Delete newly given one. + delete elem; + return; + } + } + // Not found. Add new. 
+ queue.push_back(elem); + _log_debug(elem->targetDb->p->myLog, + "Inserted new req %p into flusher queue.", elem); + l.unlock(); +} + +FlusherQueueElem* FlusherQueue::pop() { + std::lock_guard l(queueLock); + auto entry = queue.begin(); + if (entry != queue.end()) { + FlusherQueueElem* elem = *entry; + queue.pop_front(); + return elem; + } + return nullptr; +} + +size_t FlusherQueue::size() const { + std::lock_guard l(queueLock); + return queue.size(); +} + + +Flusher::Flusher(const std::string& _w_name, + const GlobalConfig& _config) + : lastCheckedFileIndex(0xffff) // Any big number to start from 0. +{ + workerName = _w_name; + gConfig = _config; + FlusherOptions options; + options.sleepDuration_ms = gConfig.flusherSleepDuration_ms; + options.worker = this; + curOptions = options; + handle = std::thread(WorkerBase::loop, &curOptions); +} + +Flusher::~Flusher() { +} + +void Flusher::work(WorkerOptions* opt_base) { + Status s; + + DBMgr* dbm = DBMgr::getWithoutInit(); + if (!dbm) return; + + DB* target_db = nullptr; + + FlusherQueueElem* elem = dbm->flusherQueue()->pop(); + if (elem) { + // User assigned work check if it is already closed. + std::lock_guard l(dbm->dbMapLock); + skiplist_node* cursor = skiplist_begin(&dbm->dbMap); + while (cursor) { + DBWrap* dbwrap = _get_entry(cursor, DBWrap, snode); + if (dbwrap->db == elem->targetDb) { + target_db = elem->targetDb; + target_db->p->incBgTask(); + break; + } + cursor = skiplist_next(&dbm->dbMap, cursor); + skiplist_release_node(&dbwrap->snode); + } + if (cursor) skiplist_release_node(cursor); + + } else { + // Otherwise: check DB map. + std::lock_guard l(dbm->dbMapLock); + + // NOTE: + // Start from right next DB of the last checked one. + // Checking outside skiplist's loop will be safe + // as long as we are holding `dbMapLock`. 
+ std::vector dbs_to_check; + + skiplist_node* cursor = skiplist_begin(&dbm->dbMap); + while (cursor) { + DBWrap* dbwrap = _get_entry(cursor, DBWrap, snode); + dbs_to_check.push_back(dbwrap); + cursor = skiplist_next(&dbm->dbMap, cursor); + skiplist_release_node(&dbwrap->snode); + } + if (cursor) skiplist_release_node(cursor); + + size_t num_dbs = dbs_to_check.size(); + if (++lastCheckedFileIndex >= num_dbs) lastCheckedFileIndex = 0; + + size_t s_idx = lastCheckedFileIndex; + size_t e_idx = lastCheckedFileIndex + num_dbs; + for (size_t ii = s_idx; ii < e_idx; ++ii) { + lastCheckedFileIndex = ii % num_dbs; + DBWrap* dbwrap = dbs_to_check[lastCheckedFileIndex]; + if (dbwrap->db->p->logMgr->checkTimeToFlush(gConfig)) { + target_db = dbwrap->db; + target_db->p->incBgTask(); + break; + } + } + } + + if (target_db) { + _log_debug(target_db->p->myLog, + "DB %p is selected for flushing: req %p.", + target_db, elem); + + bool call_fsync = false; + bool sync_only = false; + if (elem) { + if (elem->fOptions.callFsync) call_fsync = true; + if (elem->fOptions.syncOnly) sync_only = true; + } + + if (gConfig.flusherAutoSync || sync_only) { + s = target_db->sync(call_fsync); + } + if (s) { + FlushOptions f_options; + uint64_t seq_upto = NOT_INITIALIZED; + if (elem) { + // Requested by user. + f_options = elem->fOptions; + if (valid_number(elem->seqUpto)) seq_upto = elem->seqUpto; + + } else { + // Auto flush. + f_options.numFilesLimit = 8; + f_options.beyondLastSync = !gConfig.flusherAutoSync; + if (target_db->p->dbConfig.nextLevelExtension) { + // In LSM mode, bigger batch is always better. + f_options.numFilesLimit = 16; + } + } + + if (!sync_only) { + s = target_db->flushLogs(f_options, seq_upto); + } + + if (s && !elem) { + // Successful flush + auto flush mode + // = do not sleep next time (continuously work). 
+ doNotSleepNextTime = true; + } + } + + } else { + s = Status::DB_HANDLE_NOT_FOUND; + } + + bool delayed_task = false; + if (elem) { + size_t elem_count = 0, handler_count = 0; + for (auto& entry: elem->handlers) { + FlusherQueueElem::HandlerElem& he = entry; + if (he.handler) { + he.handler(s, he.ctx); + handler_count++; + } + elem_count++; + } + if (target_db) { + _log_debug(target_db->p->myLog, + "total %zu handlers out of %zu requests " + "have been invoked together", + handler_count, elem_count); + } else { + _log_info(dbm->getLogger(), + "got stale request %p, target DB doesn't exist", + elem); + } + if (elem->fOptions.execDelayUs) delayed_task = true; + delete elem; + } + + // WARNING: + // We should decrease reference counter AFTER + // user handler finishes its job. + if (target_db) { + target_db->p->decBgTask(); + } + + { // Remove pending files if exist (spend up to 1 second). + Timer tt; + tt.setDurationMs(1000); + while (!tt.timeout()) { + std::string full_path; + s = dbm->popFileToRemove(full_path); + if (!s) break; + Timer tt; + FileMgr::remove(full_path); + _log_info(dbm->getLogger(), + "removed pending file %s, %zu us", + full_path.c_str(), tt.getUs()); + } + } + + if ( dbm->flusherQueue()->size() && + !delayed_task ) { + doNotSleepNextTime = true; + } +} + +} // namespace jungle + diff --git a/src/flusher.h b/src/flusher.h new file mode 100644 index 0000000..630ba3e --- /dev/null +++ b/src/flusher.h @@ -0,0 +1,88 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "worker_mgr.h" + +#include + +#include + +namespace jungle { + +struct FlusherQueueElem { + FlusherQueueElem() + : targetDb(nullptr) + , seqUpto(NOT_INITIALIZED) + {} + + FlusherQueueElem(DB* _db, + const FlushOptions& _f_options, + const uint64_t _seq_upto, + UserHandler _handler, + void* _ctx) + : targetDb(_db) + , fOptions(_f_options) + , seqUpto(_seq_upto) + { + handlers.push_back( HandlerElem(_handler, _ctx) ); + } + + struct HandlerElem { + HandlerElem(UserHandler h = nullptr, void* c = nullptr) + : handler(h), ctx(c) + {} + UserHandler handler; + void* ctx; + }; + + DB* targetDb; + FlushOptions fOptions; + uint64_t seqUpto; + std::list handlers; +}; + +class FlusherQueue { +public: + FlusherQueue() {} + ~FlusherQueue(); + + void push(FlusherQueueElem* elem); + FlusherQueueElem* pop(); + size_t size() const; + +private: + mutable std::mutex queueLock; + std::list queue; +}; + +class Flusher : public WorkerBase { +public: + struct FlusherOptions : public WorkerOptions { + }; + + Flusher(const std::string& _w_name, + const GlobalConfig& _config); + ~Flusher(); + void work(WorkerOptions* opt_base); + + GlobalConfig gConfig; + size_t lastCheckedFileIndex; +}; + + +} // namespace jungle diff --git a/src/generic_bitmap.h b/src/generic_bitmap.h new file mode 100644 index 0000000..0b277bf --- /dev/null +++ b/src/generic_bitmap.h @@ -0,0 +1,203 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright: +See URL: https://github.com/greensky00/generic_bitmap + (v0.1.2) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include +#include + +#include +#include + +class GenericBitmap { +public: + /** + * Default constructor: initialize to 0. + */ + GenericBitmap(uint64_t num_bits, + size_t num_threads = 0) + : numThreads(num_threads) + , numBits(num_bits) + { + // Ceiling. + memorySizeByte = (numBits + 7) / 8; + if (memorySizeByte) { + myBitmap = (uint8_t*)calloc(1, memorySizeByte); + } else { + myBitmap = nullptr; + } + + init(); + } + + /** + * Copy from given memory blob. + */ + GenericBitmap(void* memory_ptr, + size_t memory_ptr_size, + size_t num_bits, + size_t num_threads = 0) + : numThreads(num_threads) + , numBits(num_bits) + , memorySizeByte(memory_ptr_size) + { + myBitmap = (uint8_t*)calloc(1, memorySizeByte); + memcpy(myBitmap, memory_ptr, memorySizeByte); + + init(); + } + + /** + * Destructor. + */ + ~GenericBitmap() { + free(myBitmap); + myBitmap = nullptr; + + delete[] locks; + locks = nullptr; + } + + /** + * Replace internal bitmap with given memory region. + * It will take ownership without memory copy. + */ + void moveFrom(void* memory_ptr, + size_t memory_ptr_size, + size_t num_bits) + { + free(myBitmap); + myBitmap = (uint8_t*)memory_ptr; + memorySizeByte = memory_ptr_size; + numBits = num_bits; + } + + /** + * Return the size of bitmap (number of bits). + */ + size_t size() const { return numBits; } + + /** + * Return the size of allocated memory region (in byte). 
+ */ + size_t getMemorySize() const { return memorySizeByte; } + + /** + * Return the memory address of allocated memory region. + */ + void* getPtr() const { return myBitmap; } + + /** + * Get the bitmap value of given index. + */ + inline bool get(uint64_t idx) { + uint64_t lock_idx = 0, offset = 0, byte_idx = 0; + parse(idx, lock_idx, offset, byte_idx); + + std::lock_guard l(locks[lock_idx]); + return getInternal(byte_idx, offset); + } + + /** + * Set the bitmap value of given index. + */ + inline bool set(uint64_t idx, bool val) { + uint64_t lock_idx = 0, offset = 0, byte_idx = 0; + parse(idx, lock_idx, offset, byte_idx); + + std::lock_guard l(locks[lock_idx]); + // NOTE: bool -> int conversion is defined in C++ standard. + return setInternal(offset, byte_idx, val); + } + + inline size_t getNumThreads() const { return numThreads; } + + inline std::mutex* getLocks() const { return locks; } + +private: + void init() { + masks8[0] = 0x80; + masks8[1] = 0x40; + masks8[2] = 0x20; + masks8[3] = 0x10; + masks8[4] = 0x08; + masks8[5] = 0x04; + masks8[6] = 0x02; + masks8[7] = 0x01; + + // `numThreads` should be 2^n. + if (!numThreads) { + numThreads = 1; + size_t num_cores = std::thread::hardware_concurrency(); + while (numThreads < num_cores) numThreads *= 2; + } + + // TODO: + // To support partitioned lock, need to resolve + // aligned memory update issue. + // Until then, just use global latch. 
+ numThreads = 1; + locks = new std::mutex[numThreads]; + + for (size_t prev = 0; prev < 256; ++prev) { + for (size_t offset = 0; offset < 8; ++offset) { + uint8_t mask = masks8[offset]; + calcTable[prev][offset][0] = prev & (~mask); + calcTable[prev][offset][1] = prev | mask; + } + } + } + + inline void parse(uint64_t idx, + uint64_t& lock_idx_out, + uint64_t& offset_out, + uint64_t& byte_idx_out) const + { + lock_idx_out = idx & (numThreads - 1); + offset_out = idx & 0x7; + byte_idx_out = idx >> 3; + } + + inline bool getInternal(uint64_t byte_idx, + uint64_t offset) const + { + uint8_t val = myBitmap[byte_idx]; + return val & masks8[offset]; + } + + inline bool setInternal(uint64_t offset, + uint64_t byte_idx, + uint8_t val) + { + uint8_t mask = masks8[offset]; + uint8_t prev = myBitmap[byte_idx]; + myBitmap[byte_idx] = calcTable[prev][offset][val]; + return prev & mask; + } + + size_t numThreads; + uint64_t numBits; + uint64_t memorySizeByte; + uint8_t* myBitmap; + uint8_t calcTable[256][8][2]; + uint8_t masks8[8]; + std::mutex* locks; +}; + diff --git a/src/hex_dump.h b/src/hex_dump.h new file mode 100644 index 0000000..4984fc9 --- /dev/null +++ b/src/hex_dump.h @@ -0,0 +1,251 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright: +See URL: https://github.com/greensky00/hex-dump + (v0.1.8) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +#ifndef _CLM_DEFINED +#define _CLM_DEFINED (1) + +#ifdef HEX_DUMP_NO_COLOR + #define _CLM_D_GRAY "" + #define _CLM_GREEN "" + #define _CLM_B_GREEN "" + #define _CLM_RED "" + #define _CLM_B_RED "" + #define _CLM_BROWN "" + #define _CLM_B_BROWN "" + #define _CLM_BLUE "" + #define _CLM_B_BLUE "" + #define _CLM_MAGENTA "" + #define _CLM_B_MAGENTA "" + #define _CLM_CYAN "" + #define _CLM_END "" + + #define _CLM_WHITE_FG_RED_BG "" +#else + #define _CLM_D_GRAY "\033[1;30m" + #define _CLM_GREEN "\033[32m" + #define _CLM_B_GREEN "\033[1;32m" + #define _CLM_RED "\033[31m" + #define _CLM_B_RED "\033[1;31m" + #define _CLM_BROWN "\033[33m" + #define _CLM_B_BROWN "\033[1;33m" + #define _CLM_BLUE "\033[34m" + #define _CLM_B_BLUE "\033[1;34m" + #define _CLM_MAGENTA "\033[35m" + #define _CLM_B_MAGENTA "\033[1;35m" + #define _CLM_CYAN "\033[36m" + #define _CLM_B_GREY "\033[1;37m" + #define _CLM_END "\033[0m" + + #define _CLM_WHITE_FG_RED_BG "\033[37;41m" +#endif + +#define _CL_D_GRAY(str) _CLM_D_GRAY str _CLM_END +#define _CL_GREEN(str) _CLM_GREEN str _CLM_END +#define _CL_RED(str) _CLM_RED str _CLM_END +#define _CL_B_RED(str) _CLM_B_RED str _CLM_END +#define _CL_MAGENTA(str) _CLM_MAGENTA str _CLM_END +#define _CL_BROWN(str) _CLM_BROWN str _CLM_END +#define _CL_B_BROWN(str) _CLM_B_BROWN str _CLM_END +#define _CL_B_BLUE(str) _CLM_B_BLUE str _CLM_END +#define _CL_B_MAGENTA(str) _CLM_B_MAGENTA str _CLM_END +#define _CL_CYAN(str) _CLM_CYAN str _CLM_END +#define _CL_B_GRAY(str) _CLM_B_GREY str _CLM_END + +#define _CL_WHITE_FG_RED_BG(str) _CLM_WHITE_FG_RED_BG str _CLM_END + +#endif + +// LCOV_EXCL_START + +struct print_hex_options { + // If set, print colorful hex dump using ANSI color codes. + int enable_colors; + // If set, print actual memory address. + int actual_address; + // The number of bytes per line. 
+ int align; +}; + +#define PRINT_HEX_OPTIONS_INITIALIZER \ + (struct print_hex_options){1, 1, 16} + +static void _print_white_space(FILE* stream, + size_t len) { + for (size_t i=0; i= (uint64_t)buf && + start_address + j < (uint64_t)buf + buflen) { + uint64_t idx = j - surplus_bytes; + fprintf(stream, + (options.enable_colors) + ? _CL_GREEN("%02x ") + : "%02x ", + ((uint8_t*)buf)[idx]); + } else { + fprintf(stream, " "); + } + + if ((j + 1) % 8 == 0) { + fprintf(stream, " "); + } + } + + // Ascii character part + fprintf(stream, " "); + for (j = i; j < i+options.align; ++j){ + if (j < buflen + surplus_bytes && + start_address + j >= (uint64_t)buf && + start_address + j < (uint64_t)buf + buflen) { + uint64_t idx = j - surplus_bytes; + + // print only readable ascii character + if (0x20 <= ((char*)buf)[idx] && ((char*)buf)[idx] <= 0x7d) { + // Printable character + fprintf(stream, + (options.enable_colors) + ? _CL_B_BLUE("%c") + : "%c", + ((char*)buf)[idx]); + } else { + // Otherwise + fprintf(stream, "."); + } + } else { + fprintf(stream, " "); + } + } + fprintf(stream, "\n"); + } +} + +static void __attribute__((unused)) + print_hex_to_buf(char** output_buf, + size_t* output_buf_len, + const void* buf, + size_t buflen, + struct print_hex_options options) +{ + FILE* stream; + stream = open_memstream(output_buf, output_buf_len); + print_hex_stream(stream, buf, buflen, options); + fflush(stream); + fclose(stream); +} + +// LCOV_EXCL_STOP + diff --git a/src/histogram.h b/src/histogram.h new file mode 100644 index 0000000..927ba5a --- /dev/null +++ b/src/histogram.h @@ -0,0 +1,324 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright: +See URL: https://github.com/greensky00/latency-collector + (v0.1.7) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +using HistBin = std::atomic; + +class Histogram; +class HistItr { +public: + HistItr() : idx(0), maxBins(0), owner(nullptr) { } + + HistItr(size_t _idx, size_t _max_bins, const Histogram* _owner) + : idx(_idx), maxBins(_max_bins), owner(_owner) {} + + // ++A + HistItr& operator++() { + idx++; + if (idx > maxBins) idx = maxBins; + return *this; + } + + // A++ + HistItr operator++(int) { + idx++; + if (idx > maxBins) idx = maxBins; + return *this; + } + + // --A + HistItr& operator--() { + if (idx || idx == maxBins) { + // end() + idx = maxBins; + } else { + idx--; + } + return *this; + } + + // A-- + HistItr operator--(int) { + if (idx || idx == maxBins) { + // end() + idx = maxBins; + } else { + idx--; + } + return *this; + } + + HistItr& operator*() { + // Just return itself + return *this; + } + + + bool operator==(const HistItr& val) const { + return (idx == val.idx); + } + + bool operator!=(const HistItr& val) const { + return (idx != val.idx); + } + + size_t getIdx() const { return idx; } + + inline uint64_t getCount(); + + uint64_t getLowerBound() { + size_t idx_rev = maxBins - idx - 1; + uint64_t ret = 1; + + if (idx_rev) { + return ret << (idx_rev-1); + } else { + return 0; + } + } + + uint64_t getUpperBound() { + size_t idx_rev = maxBins - idx - 1; + uint64_t ret = 1; + + if (!idx) return std::numeric_limits::max(); + return ret << idx_rev; + } + +private: + size_t idx; + size_t maxBins; + const 
Histogram* owner; +}; + +class Histogram { + friend class HistItr; + +public: + using iterator = HistItr; + + Histogram(double base = 2.0) + : EXP_BASE(base) + , EXP_BASE_LOG( log(base) ) + , count(0) + , sum(0) + , max(0) + { + bins = new HistBin[MAX_BINS]; + for (size_t i=0; i (int)MAX_BINS) return 0; + return (int)MAX_BINS - idx_rvs; + } + + void add(uint64_t val) { + // if `val` == 1 + // == 0x00...01 + // ^ + // 64th bit + // then `idx` = 63. + // + // if `val` == UINT64_MAX + // == 0xff...ff + // ^ + // 1st bit + // then `idx` = 0. + // + // so we should handle `val` == 0 as a special case (`idx` = 64), + // that's the reason why num bins is 65. + + int idx = MAX_BINS - 1; + if (val) { +#if defined(__linux__) || defined(__APPLE__) + idx = __builtin_clzl(val); + +#elif defined(WIN32) || defined(_WIN32) + idx = getIdx(val); +#endif + } + bins[idx].fetch_add(1, std::memory_order_relaxed); + count.fetch_add(1, std::memory_order_relaxed); + sum.fetch_add(val, std::memory_order_relaxed); + + size_t num_trial = 0; + while (num_trial++ < MAX_TRIAL && + max.load(std::memory_order_relaxed) < val) { + // 'max' may not be updated properly under race condition. + max.store(val, std::memory_order_relaxed); + } + } + + uint64_t getTotal() const { return count; } + uint64_t getSum() const { return sum; } + uint64_t getAverage() const { return ( (count) ? 
(sum / count) : 0 ); } + uint64_t getMax() const { return max; } + + iterator find(double percentile) { + if (percentile <= 0 || percentile >= 100) { + return end(); + } + + double rev = 100 - percentile; + size_t i; + uint64_t sum = 0; + uint64_t total = getTotal(); + uint64_t threshold = (uint64_t)( (double)total * rev / 100.0 ); + + for (i=0; i= threshold) { + return HistItr(i, MAX_BINS, this); + } + } + return end(); + } + + uint64_t estimate(double percentile) { + if (percentile <= 0 || percentile >= 100) { + return 0; + } + + double rev = 100 - percentile; + size_t i; + uint64_t sum = 0; + uint64_t total = getTotal(); + uint64_t threshold = (uint64_t)( (double)total * rev / 100.0 ); + + if (!threshold) { + // No samples between the given percentile and the max number. + // Return max number. + return max; + } + + for (i=0; i count; + std::atomic sum; + std::atomic max; +}; + +uint64_t HistItr::getCount() { + return owner->bins[idx]; +} + diff --git a/src/internal_helper.cc b/src/internal_helper.cc new file mode 100644 index 0000000..ed1bf66 --- /dev/null +++ b/src/internal_helper.cc @@ -0,0 +1,492 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "internal_helper.h" + +#include "fileops_base.h" +#include "hex_dump.h" +#include "libjungle/jungle.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace jungle { + +size_t RwSerializer::size() { + if (len) return len; + return fOps->eof(dstFile); +} + +size_t RwSerializer::pos() const { + return curPos; +} + +size_t RwSerializer::pos(size_t new_pos) { + if (len) { + assert(new_pos <= len); + } + curPos = new_pos; + return curPos; +} + +void RwSerializer::chkResize(size_t new_data_size) { + // Not resizable mode. + if (!rBuf) return; + + if (rBuf->empty()) { + // Allocate 32 bytes if empty. + rBuf->alloc(32); + } + + size_t new_buf_size = rBuf->size; + while (curPos + new_data_size > new_buf_size) { + // Double the size. + new_buf_size *= 2; + } + rBuf->resize(new_buf_size); + dstPtr = rBuf->data; + len = rBuf->size; +} + +Status RwSerializer::put(const void* data, size_t size) { + uint64_t off = pos(); + Status s; + chkResize(size); + if (dstPtr) { + append_mem(dstPtr, data, size, 0, off); + } else if (supportAppend) { + s = append_file(fOps, dstFile, data, size); + } else { + s = append_file(fOps, dstFile, data, size, 0, off); + } + if (s) pos(off); + return s; +} + +Status RwSerializer::putSb(const SizedBuf& s_buf) { + Status s; + s = putU32(s_buf.size); + if (!s) return s; + + s = put(s_buf.data, s_buf.size); + if (!s) { + // Rollback. 
+ pos( pos() - sizeof(uint32_t) ); + } + return s; +} + +Status RwSerializer::putU64(uint64_t val) { + uint64_t off = pos(); + Status s; + chkResize( sizeof(val) ); + if (dstPtr) { + append_mem_64(dstPtr, val, 0, off); + } else if (supportAppend) { + s = append_file_64(fOps, dstFile, val); + } else { + s = append_file_64(fOps, dstFile, val, 0, off); + } + if (s) pos(off); + return s; +} + +Status RwSerializer::putU32(uint32_t val) { + uint64_t off = pos(); + Status s; + chkResize( sizeof(val) ); + if (dstPtr) { + append_mem_32(dstPtr, val, 0, off); + } else if (supportAppend) { + s = append_file_32(fOps, dstFile, val); + } else { + s = append_file_32(fOps, dstFile, val, 0, off); + } + if (s) pos(off); + return s; +} + +Status RwSerializer::putU16(uint16_t val) { + uint64_t off = pos(); + Status s; + chkResize( sizeof(val) ); + if (dstPtr) { + append_mem_16(dstPtr, val, 0, off); + } else if (supportAppend) { + s = append_file_16(fOps, dstFile, val); + } else { + s = append_file_16(fOps, dstFile, val, 0, off); + } + if (s) pos(off); + return s; +} + +Status RwSerializer::putU8(uint8_t val) { + uint64_t off = pos(); + Status s; + chkResize( sizeof(val) ); + if (dstPtr) { + append_mem_8(dstPtr, val, 0, off); + } else if (supportAppend) { + s = append_file_8(fOps, dstFile, val); + } else { + s = append_file_8(fOps, dstFile, val, 0, off); + } + if (s) pos(off); + return s; +} + +Status RwSerializer::get(void* data, size_t size) { + uint64_t off = pos(); + Status s; + if (dstPtr) { + if (curPos + size > len) { + s = Status::READ_VIOLATION; + } else { + read_mem(dstPtr, data, size, 0, off); + } + } else { + s = read_file(fOps, dstFile, data, size, 0, off); + } + if (s) pos(off); + return s; +} + +Status RwSerializer::getSb(SizedBuf& s_buf_out, bool clone) { + Status s; + uint32_t len = getU32(s); + s_buf_out.clear(); + if (len) { + if (clone) { + s_buf_out.alloc(len); + s = get(s_buf_out.data, len); + if (!s) s_buf_out.free(); + } else { + s_buf_out.data = dstPtr + curPos; 
+ s_buf_out.size = len; + pos( pos() + len ); + } + } + return s; +} + +uint64_t RwSerializer::getU64(Status& s_out) { + uint64_t off = pos(); + uint64_t val = 0; + if (dstPtr) { + if (curPos + sizeof(val) > len) { + s_out = Status::READ_VIOLATION; + } else { + read_mem_64(dstPtr, val, 0, off); + } + } else { + s_out = read_file_64(fOps, dstFile, val, 0, off); + } + if (s_out) pos(off); + return val; +} + +uint32_t RwSerializer::getU32(Status& s_out) { + uint64_t off = pos(); + uint32_t val = 0; + if (dstPtr) { + if (curPos + sizeof(val) > len) { + s_out = Status::READ_VIOLATION; + } else { + read_mem_32(dstPtr, val, 0, off); + } + } else { + s_out = read_file_32(fOps, dstFile, val, 0, off); + } + if (s_out) pos(off); + return val; +} + +uint16_t RwSerializer::getU16(Status& s_out) { + uint64_t off = pos(); + uint16_t val = 0; + if (dstPtr) { + if (curPos + sizeof(val) > len) { + s_out = Status::READ_VIOLATION; + } else { + read_mem_16(dstPtr, val, 0, off); + } + } else { + s_out = read_file_16(fOps, dstFile, val, 0, off); + } + if (s_out) pos(off); + return val; +} + +uint8_t RwSerializer::getU8(Status& s_out) { + uint64_t off = pos(); + uint8_t val = 0; + if (dstPtr) { + if (curPos + sizeof(val) > len) { + s_out = Status::READ_VIOLATION; + } else { + read_mem_8(dstPtr, val, 0, off); + } + } else { + s_out = read_file_8(fOps, dstFile, val, 0, off); + } + if (s_out) pos(off); + return val; +} + +bool RwSerializer::available(size_t nbyte) { + if (dstPtr) { + return (pos() + nbyte) <= len; + } else if (fOps) { + return true; + } else { + return false; + } +} + +Status BackupRestore::copyFile(FileOps* f_ops, + const std::string& src_file, + const std::string& dst_file) +{ + Status s; + if (!f_ops->exist(src_file)) return Status::FILE_NOT_EXIST; + + FileHandle* s_file = nullptr; + FileHandle* d_file = nullptr; + + try { + TC(f_ops->open(&s_file, src_file)); + TC(f_ops->open(&d_file, dst_file)); + + size_t file_size = f_ops->eof(s_file); + SizedBuf tmp_buf(file_size); + 
SizedBuf::Holder h_tmp_buf(tmp_buf); + + TC( f_ops->pread(s_file, tmp_buf.data, tmp_buf.size, 0) ); + TC( f_ops->pwrite(d_file, tmp_buf.data, tmp_buf.size, 0) ); + f_ops->ftruncate(d_file, file_size); + + f_ops->close(s_file); + f_ops->close(d_file); + delete s_file; + delete d_file; + + return s; + + } catch (Status s) { + if (s_file) { + f_ops->close(s_file); + delete s_file; + } + if (d_file) { + f_ops->close(d_file); + delete d_file; + } + return s; + } +} + +Status BackupRestore::backup(FileOps* f_ops, + const std::string& filename) +{ + return copyFile(f_ops, filename, filename + ".bak"); +} + +Status BackupRestore::backup(FileOps* f_ops, + const std::string& filename, + const SizedBuf& ctx, + size_t length) +{ + Status s; + std::string dst_file = filename + ".bak"; + + FileHandle* d_file = nullptr; + + try { + TC( f_ops->open(&d_file, dst_file) ); + TC( f_ops->pwrite(d_file, ctx.data, length, 0) ); + f_ops->ftruncate(d_file, length); + f_ops->close(d_file); + delete d_file; + return s; + + } catch (Status s) { + if (d_file) { + f_ops->close(d_file); + delete d_file; + } + return s; + } +} + +Status BackupRestore::restore(FileOps* f_ops, + const std::string& filename) +{ + return copyFile(f_ops, filename + ".bak", filename); +} + +std::string HexDump::toString(const std::string& str) { + return toString(str.data(), str.size()); +} + +std::string HexDump::toString(const void* pd, size_t len) { + char* buffer; + size_t buffer_len; + print_hex_options opt = PRINT_HEX_OPTIONS_INITIALIZER; + opt.actual_address = 0; + opt.enable_colors = 0; + print_hex_to_buf(&buffer, &buffer_len, + pd, len, + opt); + std::string s = std::string(buffer); + free(buffer); + return s; +} + +std::string HexDump::toString(const SizedBuf& buf) { + return toString(buf.data, buf.size); +} + +std::string HexDump::rStr(const std::string& str, size_t limit) { + std::stringstream ss; + size_t size = std::min(str.size(), limit); + for (size_t ii=0; ii& files_out) +{ + DIR *dir_info = nullptr; + 
struct dirent *dir_entry = nullptr; + + dir_info = opendir(path.c_str()); + while ( dir_info && (dir_entry = readdir(dir_info)) ) { + files_out.push_back(dir_entry->d_name); + } + if (dir_info) { + closedir(dir_info); + } + return 0; +} + +std::string FileMgr::filePart(const std::string& full_path) { + size_t pos = full_path.rfind("/"); + if (pos == std::string::npos) return full_path; + return full_path.substr(pos + 1, full_path.size() - pos - 1); +} + +bool FileMgr::exist(const std::string& path) { + struct stat st; + int result = stat(path.c_str(), &st); + return (result == 0); +} + +uint64_t FileMgr::fileSize(const std::string& path) { + struct stat st; + int result = stat(path.c_str(), &st); + if (result != 0) return 0; + return st.st_size; +} + +int FileMgr::remove(const std::string& path) { + if (!exist(path)) return 0; + return ::remove(path.c_str()); +} + +int FileMgr::removeDir(const std::string& path) { + if (!exist(path)) return 0; + // TODO: non-Posix OS. + std::string cmd = "rm -rf " + path; + return ::system(cmd.c_str()); +} + +int FileMgr::copy(const std::string& from, const std::string& to) { + std::string cmd = "cp -R " + from + " " + to + " 2> /dev/null"; + FILE* fp = popen(cmd.c_str(), "r"); + return pclose(fp); +} + +int FileMgr::move(const std::string& from, const std::string& to) { + std::string cmd = "mv " + from + " " + to + " 2> /dev/null"; + FILE* fp = popen(cmd.c_str(), "r"); + return pclose(fp); +} + +int FileMgr::mkdir(const std::string& path) { + return ::mkdir(path.c_str(), 0755); +} + +uint64_t FileMgr::dirSize(const std::string& path, + bool recursive) +{ + uint64_t ret = 0; + DIR *dir_info = nullptr; + struct dirent *dir_entry = nullptr; + + dir_info = opendir(path.c_str()); + while ( dir_info && (dir_entry = readdir(dir_info)) ) { + std::string d_name = dir_entry->d_name; + if (d_name == "." 
|| d_name == "..") continue; + + std::string full_path = path + "/" + d_name; + + if (dir_entry->d_type == DT_REG) { + struct stat st; + if (stat(full_path.c_str(), &st) == 0) { + ret += st.st_size; + } + + } else if (recursive && dir_entry->d_type == DT_DIR) { + ret += dirSize(full_path, recursive); + } + } + if (dir_info) { + closedir(dir_info); + } + return ret; +} + +size_t RndGen::fromProbDist(std::vector& prob_dist) { + uint64_t sum = 0; + for (size_t& prob: prob_dist) sum += (prob * 65536); + assert(sum > 0); + + uint64_t rr = rand() % sum; + uint64_t cnt = 0; + for (size_t ii=0; ii +#include +#include +#include +#include +#include +#include + +#include +#include + +#define __MACRO_TO_STR(arg) #arg +#define _MACRO_TO_STR(arg) __MACRO_TO_STR(arg) + +namespace jungle { + +static Status lastRwStatus; + +class RwSerializer { +public: + static Status lastStatus; + + // Memory buffer mode. + RwSerializer(void* dst_ptr, size_t _len) + : dstPtr(static_cast(dst_ptr)) + , rBuf(nullptr) + , fOps(nullptr) + , dstFile(nullptr) + , len(_len) + , curPos(0) + , supportAppend(false) + {} + + // SizedBuf mode (same as memory buffer mode). + RwSerializer(const SizedBuf& s_buf) + : dstPtr(static_cast(s_buf.data)) + , rBuf(nullptr) + , fOps(nullptr) + , dstFile(nullptr) + , len(s_buf.size) + , curPos(0) + , supportAppend(false) + {} + + // Resizable SizedBuf mode. + RwSerializer(SizedBuf* s_buf) + : dstPtr(static_cast(s_buf->data)) + , rBuf(s_buf) + , fOps(nullptr) + , dstFile(nullptr) + , len(s_buf->size) + , curPos(0) + , supportAppend(false) + {} + + // File mode. 
+ RwSerializer(FileOps* f_ops, FileHandle* dst_file, bool _supportAppend = false) + : dstPtr(nullptr) + , rBuf(nullptr) + , fOps(f_ops) + , dstFile(dst_file) + , len(0) + , curPos(0) + , supportAppend(_supportAppend) + {} + + size_t size(); + size_t pos() const; + size_t pos(size_t new_pos); + void chkResize(size_t new_data_size); + Status put(const void* data, size_t size); + Status putSb(const SizedBuf& s_buf); + Status putU64(uint64_t val); + Status putU32(uint32_t val); + Status putU16(uint16_t val); + Status putU8(uint8_t val); + Status get(void* data, size_t size); + Status getSb(SizedBuf& s_buf_out, bool clone = true); + uint64_t getU64(Status& s_out = lastRwStatus); + uint32_t getU32(Status& s_out = lastRwStatus); + uint16_t getU16(Status& s_out = lastRwStatus); + uint8_t getU8(Status& s_out = lastRwStatus); + bool available(size_t nbyte); + +private: + uint8_t* dstPtr; + SizedBuf* rBuf; + FileOps* fOps; + FileHandle* dstFile; + size_t len; + size_t curPos; + bool supportAppend; +}; + +class BackupRestore { +public: + static Status copyFile(FileOps* f_ops, + const std::string& src_file, + const std::string& dst_file); + static Status backup(FileOps* f_ops, + const std::string& filename); + static Status backup(FileOps* f_ops, + const std::string& filename, + const SizedBuf& ctx, + size_t length); + static Status restore(FileOps* f_ops, + const std::string& filename); +}; + +struct Timer { + Timer(size_t duration_ms = 0) + : durationUs(0) + { + if (duration_ms) setDurationMs(duration_ms); + reset(); + } + void reset() { + std::lock_guard l(lock); + tCreated = std::chrono::system_clock::now(); + } + void setDurationMs(size_t to) { + durationUs = to * 1000; + } + bool timeout() const { + std::lock_guard l(lock); + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - tCreated; + return (durationUs < elapsed.count() * 1000000); + } + bool timeoutAndReset() { + std::lock_guard l(lock); + auto cur = std::chrono::system_clock::now(); 
+ std::chrono::duration elapsed = cur - tCreated; + if (durationUs < elapsed.count() * 1000000) { + tCreated = cur; + return true; + } + return false; + } + uint64_t getUs() { + std::lock_guard l(lock); + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - tCreated; + return (uint64_t)1000000 * elapsed.count(); + } + uint64_t getMs() { + std::lock_guard l(lock); + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - tCreated; + return (uint64_t)1000 * elapsed.count(); + } + double getSec() { + std::lock_guard l(lock); + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - tCreated; + return elapsed.count(); + } + static void sleepUs(size_t us) { + std::this_thread::sleep_for(std::chrono::microseconds(us)); + } + static void sleepMs(size_t ms) { + std::this_thread::sleep_for(std::chrono::milliseconds(ms)); + } + std::chrono::time_point tCreated; + size_t durationUs; + mutable std::mutex lock; +}; + +struct VerboseLog { + VerboseLog(size_t time_window_ms) + : timer(time_window_ms) + , numSuppressed(0) + {} + + bool checkPrint(int& num_suppressed_out) { + if (timer.timeoutAndReset()) { + num_suppressed_out = numSuppressed; + numSuppressed.fetch_sub(num_suppressed_out); + return true; + } + numSuppressed.fetch_add(1); + return false; + } + + Timer timer; + std::atomic numSuppressed; +}; + +#define _SCU32(val) static_cast(val) +#define _SCU64(val) static_cast(val) + +// Error Pass +#define EP(cmd) \ + s = cmd; \ + if (!s) return s \ + +// Try Catch +#define TC(cmd) \ + s = cmd; \ + if (!s) throw s \ + +// Error Break +#define EB(cmd, msg) \ + s = cmd; \ + if (!s) { m = msg; break; } \ + +#define EP_(cmd) \ + cmd; \ + if (!s) return s \ + +#define TC_(cmd) \ + cmd; \ + if (!s) throw s \ + +#define EB_(cmd, msg) \ + cmd; \ + if (!s) { m = msg; break; } \ + +// Delete / free and clear the pointer. 
+#define DELETE(ptr) \ + delete ptr; \ + ptr = nullptr \ + +#define FREE(ptr) \ + free(ptr); \ + ptr = nullptr \ + +// If-clause that can use `break` in the middle. +#define IF(cond) \ + for ( bool __first_loop__ = true; \ + __first_loop__ && (cond); \ + __first_loop__ = false ) \ + +using mGuard = std::unique_lock; + +static const std::memory_order MOR = std::memory_order_relaxed; +static const std::memory_order MOSC = std::memory_order_seq_cst; + +static const uint64_t NOT_INITIALIZED = static_cast(-1); + +// fe 00 00 00 00 00 00 00 +static const uint64_t PADDING_HEADER_FLAG = 0xfeUL << 56; + +inline bool valid_number(uint64_t seq) { + return (seq != NOT_INITIALIZED); +} + +inline std::string _seq_str(uint64_t seqnum) { + if (!valid_number(seqnum)) { + return "(NIL)"; + } + return std::to_string(seqnum); +} + + +inline Status append_file(FileOps* ops, FileHandle* fhandle, + const void* data, size_t size, + uint64_t start, uint64_t& offset) +{ + Status s; + s = ops->pwrite(fhandle, data, size, start + offset); + if (s) offset += size; + return s; +} +inline Status append_file_64(FileOps* ops, FileHandle* fhandle, + uint64_t val, uint64_t start, uint64_t& offset) +{ + uint64_t e64 = _enc(val); + return append_file(ops, fhandle, &e64, sizeof(e64), start, offset); +} +inline Status append_file_32(FileOps* ops, FileHandle* fhandle, + uint32_t val, uint64_t start, uint64_t& offset) +{ + uint32_t e32 = _enc(val); + return append_file(ops, fhandle, &e32, sizeof(e32), start, offset); +} +inline Status append_file_16(FileOps* ops, FileHandle* fhandle, + uint16_t val, uint64_t start, uint64_t& offset) +{ + uint16_t e16 = _enc(val); + return append_file(ops, fhandle, &e16, sizeof(e16), start, offset); +} +inline Status append_file_8(FileOps* ops, FileHandle* fhandle, + uint8_t val, uint64_t start, uint64_t& offset) +{ + return append_file(ops, fhandle, &val, sizeof(val), start, offset); +} + +// Real append file +inline Status append_file(FileOps* ops, FileHandle* fhandle, + 
const void* data, size_t size) +{ + Status s; + s = ops->append(fhandle, data, size); + return s; +} +inline Status append_file_64(FileOps* ops, FileHandle* fhandle, uint64_t val) +{ + uint64_t e64 = _enc(val); + return append_file(ops, fhandle, &e64, sizeof(e64)); +} +inline Status append_file_32(FileOps* ops, FileHandle* fhandle, uint32_t val) +{ + uint32_t e32 = _enc(val); + return append_file(ops, fhandle, &e32, sizeof(e32)); +} +inline Status append_file_16(FileOps* ops, FileHandle* fhandle, uint16_t val) +{ + uint16_t e16 = _enc(val); + return append_file(ops, fhandle, &e16, sizeof(e16)); +} +inline Status append_file_8(FileOps* ops, FileHandle* fhandle, uint8_t val) +{ + return append_file(ops, fhandle, &val, sizeof(val)); +} + + +inline Status read_file(FileOps* ops, FileHandle* fhandle, + void* data, size_t size, + uint64_t start, uint64_t& offset) +{ + Status s; + s = ops->pread(fhandle, data, size, start + offset); + if (s) offset += size; + return s; +} +inline Status read_file_64(FileOps* ops, FileHandle* fhandle, + uint64_t& val, uint64_t start, uint64_t& offset) +{ + uint64_t e64; + Status s = read_file(ops, fhandle, &e64, sizeof(e64), start, offset); + if (!s) return s; + val = _dec(e64); + return s; +} +inline Status read_file_32(FileOps* ops, FileHandle* fhandle, + uint32_t& val, uint64_t start, uint64_t& offset) +{ + uint32_t e32; + Status s = read_file(ops, fhandle, &e32, sizeof(e32), start, offset); + if (!s) return s; + val = _dec(e32); + return s; +} +inline Status read_file_16(FileOps* ops, FileHandle* fhandle, + uint16_t& val, uint64_t start, uint64_t& offset) +{ + uint16_t e16; + Status s = read_file(ops, fhandle, &e16, sizeof(e16), start, offset); + if (!s) return s; + val = _dec(e16); + return s; +} +inline Status read_file_8(FileOps* ops, FileHandle* fhandle, + uint8_t& val, uint64_t start, uint64_t& offset) +{ + Status s = read_file(ops, fhandle, &val, sizeof(val), start, offset); + return s; +} + + +inline void append_mem(void* dst, 
const void* data, size_t size, + uint64_t start, uint64_t& offset) +{ + uint8_t* ptr = static_cast(dst); + memcpy(ptr + start + offset, data, size); + offset += size; +} +inline void append_mem_64(void* dst, uint64_t val, + uint64_t start, uint64_t& offset) +{ + uint64_t e64 = _enc(val); + return append_mem(dst, &e64, sizeof(e64), start, offset); +} +inline void append_mem_32(void* dst, uint32_t val, + uint64_t start, uint64_t& offset) +{ + uint32_t e32 = _enc(val); + return append_mem(dst, &e32, sizeof(e32), start, offset); +} +inline void append_mem_16(void* dst, uint16_t val, + uint64_t start, uint64_t& offset) +{ + uint16_t e16 = _enc(val); + return append_mem(dst, &e16, sizeof(e16), start, offset); +} +inline void append_mem_8(void* dst, uint8_t val, + uint64_t start, uint64_t& offset) +{ + return append_mem(dst, &val, sizeof(val), start, offset); +} + + +inline void read_mem(void* dst, void* data, size_t size, + uint64_t start, uint64_t& offset) +{ + uint8_t* ptr = static_cast(dst); + memcpy(data, ptr + start + offset, size); + offset += size; +} +inline void read_mem_64(void* dst, uint64_t& val, + uint64_t start, uint64_t& offset) +{ + uint64_t e64; + read_mem(dst, &e64, sizeof(e64), start, offset); + val = _dec(e64); +} +inline void read_mem_32(void* dst, uint32_t& val, + uint64_t start, uint64_t& offset) +{ + uint32_t e32; + read_mem(dst, &e32, sizeof(e32), start, offset); + val = _dec(e32); +} +inline void read_mem_16(void* dst, uint16_t& val, + uint64_t start, uint64_t& offset) +{ + uint16_t e16; + read_mem(dst, &e16, sizeof(e16), start, offset); + val = _dec(e16); +} +inline void read_mem_8(void* dst, uint8_t& val, + uint64_t start, uint64_t& offset) +{ + read_mem(dst, &val, sizeof(val), start, offset); +} + +inline bool contains_null(void* a, void *b) { + return !a || !b; +} + +inline int cmp_null_chk(void* a, void *b) { + if (!a && !b) return 0; // Both are null: a == b + if (!a && b) return -1; // Only `a` is null: a < b. 
+ if ( a && !b) return 1; // Only `b` is null: a > b. + assert(0); + return 0xff; +} + +inline size_t getMurmurHash32(const SizedBuf& data) { + uint32_t output = 0; + MurmurHash3_x86_32(data.data, data.size, 0, &output); + return output; +} + +inline size_t getMurmurHash(const SizedBuf& data, size_t limit) { + uint32_t output = 0; + MurmurHash3_x86_32(data.data, data.size, 0, &output); + return output % limit; +} + +inline uint64_t getMurmurHash64(const SizedBuf& data) { + uint64_t output[2]; + MurmurHash3_x64_128(data.data, data.size, 0, output); + return output[0]; +} + +inline const char* getOnOffStr(bool cond) { + if (cond) return "ON"; + else return "OFF"; +} + +#define CMP_NULL_CHK(a, b) \ + if ( contains_null( (a), (b) ) ) return cmp_null_chk( (a), (b) ); + +class HexDump { +public: + static std::string toString(const std::string& str); + static std::string toString(const void* pd, size_t len); + static std::string toString(const SizedBuf& buf); + static std::string rStr(const std::string& str, size_t limit = 16); +}; + +class FileMgr { +public: + static int scan(const std::string& path, + std::vector& files_out); + static std::string filePart(const std::string& full_path); + static bool exist(const std::string& path); + static uint64_t fileSize(const std::string& path); + static int remove(const std::string& path); + static int removeDir(const std::string& path); + static int copy(const std::string& from, const std::string& to); + static int move(const std::string& from, const std::string& to); + static int mkdir(const std::string& path); + static uint64_t dirSize(const std::string& path, + bool recursive = false); +}; + +template +class GcDelete { +public: + GcDelete(T& _src) : done(false), src(_src) {} + ~GcDelete() { + gcNow(); + } + void gcNow() { + if (!done) { + delete src; + src = nullptr; + done = true; + } + } +private: + bool done; + T& src; +}; + +class RndGen { +public: + static size_t fromProbDist(std::vector& prob_dist); +}; + +class Formatter 
{ +public: + static std::string usToString(uint64_t us, size_t precision = 1) { + std::stringstream ss; + if (us < 1000) { + // us + ss << std::fixed << std::setprecision(0) << us << " us"; + } else if (us < 1000000) { + // ms + double tmp = static_cast(us / 1000.0); + ss << std::fixed << std::setprecision(precision) << tmp << " ms"; + } else if (us < (uint64_t)600 * 1000000) { + // second: 1 s -- 600 s (10 mins) + double tmp = static_cast(us / 1000000.0); + ss << std::fixed << std::setprecision(precision) << tmp << " s"; + } else { + // minute + double tmp = static_cast(us / 60.0 / 1000000.0); + ss << std::fixed << std::setprecision(0) << tmp << " m"; + } + return ss.str(); + } + + static std::string countToString(uint64_t count, size_t precision = 1) { + std::stringstream ss; + if (count < 1000) { + ss << count; + } else if (count < 1000000) { + // K + double tmp = static_cast(count / 1000.0); + ss << std::fixed << std::setprecision(precision) << tmp << "K"; + } else if (count < (uint64_t)1000000000) { + // M + double tmp = static_cast(count / 1000000.0); + ss << std::fixed << std::setprecision(precision) << tmp << "M"; + } else { + // B + double tmp = static_cast(count / 1000000000.0); + ss << std::fixed << std::setprecision(precision) << tmp << "B"; + } + return ss.str(); + } + + static std::string sizeToString(uint64_t size, size_t precision = 1) { + std::stringstream ss; + if (size < 1024) { + ss << size << " B"; + } else if (size < 1024*1024) { + // K + double tmp = static_cast(size / 1024.0); + ss << std::fixed << std::setprecision(precision) << tmp << " KiB"; + } else if (size < (uint64_t)1024*1024*1024) { + // M + double tmp = static_cast(size / 1024.0 / 1024.0); + ss << std::fixed << std::setprecision(precision) << tmp << " MiB"; + } else { + // B + double tmp = static_cast(size / 1024.0 / 1024.0 / 1024.0); + ss << std::fixed << std::setprecision(precision) << tmp << " GiB"; + } + return ss.str(); + } + +}; + +class GcFunc { +public: + using Func = 
std::function< void() >; + + GcFunc(Func _func) : done(false), func(_func) {} + ~GcFunc() { gcNow(); } + void gcNow() { + if (!done) { + func(); + done = true; + } + } +private: + bool done; + Func func; +}; + +class StrHelper { +public: + // Replace all `before`s in `src_str` with `after. + // e.g.) before="a", after="A", src_str="ababa" + // result: "AbAbA" + static std::string replace(const std::string& src_str, + const std::string& before, + const std::string& after) + { + size_t last = 0; + size_t pos = src_str.find(before, last); + std::string ret; + while (pos != std::string::npos) { + ret += src_str.substr(last, pos - last); + ret += after; + last = pos + before.size(); + pos = src_str.find(before, last); + } + if (last < src_str.size()) { + ret += src_str.substr(last); + } + return ret; + } + + // e.g.) + // src = "a,b,c", delim = "," + // result = {"a", "b", "c"} + static std::vector tokenize(const std::string& src, + const std::string& delim) + { + std::vector ret; + size_t last = 0; + size_t pos = src.find(delim, last); + while (pos != std::string::npos) { + ret.push_back( src.substr(last, pos - last) ); + last = pos + delim.size(); + pos = src.find(delim, last); + } + if (last < src.size()) { + ret.push_back( src.substr(last) ); + } + return ret; + } + + // Trim heading whitespace and trailing whitespace + // e.g.) 
+ // src = " a,b,c ", whitespace =" " + // result = "a,b,c" + static std::string trim(const std::string& src, + const std::string whitespace = " \t\n") + { + // start pos + const size_t pos = src.find_first_not_of(whitespace); + if (pos == std::string::npos) + return ""; + + const size_t last = src.find_last_not_of(whitespace); + const size_t len = last - pos + 1; + + return src.substr(pos, len); + } +}; + +} // namespace jungle + diff --git a/src/iterator.cc b/src/iterator.cc new file mode 100644 index 0000000..cda97ae --- /dev/null +++ b/src/iterator.cc @@ -0,0 +1,598 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "db_internal.h" + +#include + +namespace jungle { + +Iterator::Iterator() : p(new IteratorInternal(this)) {} +Iterator::~Iterator() { + close(); + delete p; +} + +Status Iterator::init(DB* dd, + const SizedBuf& start_key, + const SizedBuf& end_key) +{ + Status s; + EP( dd->p->checkHandleValidity() ); + + p->db = dd; + p->type = ItrInt::BY_KEY; + if (p->db->p->dbConfig.cmpFunc) { + // Custom cmp mode. + avl_set_aux(&p->curWindow, (void*)p->db); + } + + // LogMgr iterator + ItrInt::ItrItem* ctx_log = new ItrInt::ItrItem(); + ctx_log->logItr = new LogMgr::Iterator(); + s = ctx_log->logItr->init( (dd->sn) ? 
dd : nullptr, + p->db->p->logMgr, + start_key, + end_key ); + if (s) s = ctx_log->logItr->get(ctx_log->lastRec); + if (s) { + avl_node* avl_ret = + avl_insert(&p->curWindow, &ctx_log->an, ItrInt::ItrItem::cmpKey); + assert(avl_ret == &ctx_log->an); + (void)avl_ret; + } + p->itrs.push_back(ctx_log); + + // TableMgr iterator + ItrInt::ItrItem* ctx_table = new ItrInt::ItrItem(); + ctx_table->tableItr = new TableMgr::Iterator(); + s = ctx_table->tableItr->init( (dd->sn) ? dd : nullptr, + p->db->p->tableMgr, + start_key, + end_key ); + if (s) s = ctx_table->tableItr->get(ctx_table->lastRec); + if (s) { + avl_node* avl_ret = + avl_insert(&p->curWindow, &ctx_table->an, ItrInt::ItrItem::cmpKey); + assert(avl_ret == &ctx_table->an); + (void)avl_ret; + } + p->itrs.push_back(ctx_table); + + p->windowCursor = avl_first(&p->curWindow); + + // NOTE: + // Even though this is an empty iterator (i.e., p->windowCursor == NULL), + // return OK for now, but all other API will not work. + if (p->type == ItrInt::BY_KEY) { + ItrInt::ItrItem* cur_item = + _get_entry(p->windowCursor, ItrInt::ItrItem, an); + while (cur_item && !cur_item->lastRec.isIns()) { + s = next(); + if (!s) return Status(); + cur_item = _get_entry(p->windowCursor, ItrInt::ItrItem, an); + } + } + + return Status(); +} + +Status Iterator::initSN(DB* db, + const uint64_t min_seq, + const uint64_t max_seq) +{ + Status s; + EP( db->p->checkHandleValidity() ); + + p->db = db; + p->type = ItrInt::BY_SEQ; + if (p->db->p->dbConfig.cmpFunc) { + // Custom cmp mode. + avl_set_aux(&p->curWindow, (void*)p->db); + } + + // LogMgr iterator + ItrInt::ItrItem* ctx_log = new ItrInt::ItrItem(); + ctx_log->logItr = new LogMgr::Iterator(); + s = ctx_log->logItr->initSN( (db->sn) ? 
db : nullptr, + p->db->p->logMgr, + min_seq, + max_seq ); + if (s) s = ctx_log->logItr->get(ctx_log->lastRec); + if (s) { + avl_node* ret = + avl_insert(&p->curWindow, &ctx_log->an, ItrInt::ItrItem::cmpSeq); + assert(ret == &ctx_log->an); + (void)ret; + } + p->itrs.push_back(ctx_log); + + // TableMgr iterator + ItrInt::ItrItem* ctx_table = new ItrInt::ItrItem(); + ctx_table->tableItr = new TableMgr::Iterator(); + s = ctx_table->tableItr->initSN( (db->sn) ? db : nullptr, + p->db->p->tableMgr, + min_seq, + max_seq ); + if (s) s = ctx_table->tableItr->get(ctx_table->lastRec); + if (s) { + avl_node* ret = + avl_insert(&p->curWindow, &ctx_table->an, ItrInt::ItrItem::cmpSeq); + assert(ret == &ctx_table->an); + (void)ret; + } + p->itrs.push_back(ctx_table); + + p->windowCursor = avl_first(&p->curWindow); + + // NOTE: + // Same as in init(), allow empty iterator. + return Status(); +} + +Status Iterator::get(Record& rec_out) { + if (!p || !p->db) return Status::NOT_INITIALIZED; + if (!p->windowCursor) return Status::KEY_NOT_FOUND; + if (p && p->db) p->db->p->updateOpHistory(); + + Status s; + ItrInt::ItrItem* item = _get_entry(p->windowCursor, ItrInt::ItrItem, an); + if ( p->type == ItrInt::BY_KEY && + !item->lastRec.isIns() ) { + // by-key: only allow insertion record. + return Status::KEY_NOT_FOUND; + } + item->lastRec.copyTo(rec_out); + return Status(); +} + +Status Iterator::prev() { + if (!p || !p->db) return Status::NOT_INITIALIZED; + if (!p->windowCursor) return Status::OUT_OF_RANGE; + if (p && p->db) p->db->p->updateOpHistory(); + + // Due to deleted key, it may include multiple steps. 
+ for (;;) { + ItrInt::ItrItem* cur_item = _get_entry(p->windowCursor, ItrInt::ItrItem, an); + uint64_t cur_seq = cur_item->lastRec.seqNum; + SizedBuf cur_key; + cur_key.alloc(cur_item->lastRec.kv.key); + + Status s; + avl_node* cursor = avl_last(&p->curWindow); + while (cursor) { + ItrInt::ItrItem* item = _get_entry(cursor, ItrInt::ItrItem, an); + if (item->flags & ItrInt::ItrItem::no_more_prev) { + s = Status::ERROR; + } else { + if ( p->type == ItrInt::BY_SEQ && + item->lastRec.seqNum < cur_seq ) break; + if ( p->type == ItrInt::BY_KEY && + p->cmpSizedBuf(item->lastRec.kv.key, cur_key) < 0 ) break; + + if (item->logItr) s = item->logItr->prev(); + if (item->tableItr) s = item->tableItr->prev(); + } + + if (s) { + avl_remove(&p->curWindow, &item->an); + item->flags = ItrInt::ItrItem::none; + if (item->logItr) { + s = item->logItr->get(item->lastRec); + } + if (item->tableItr) { + s = item->tableItr->get(item->lastRec); + } + assert(s); + + avl_cmp_func* cmp_func = (p->type == ItrInt::BY_SEQ) + ? (ItrInt::ItrItem::cmpSeq) + : (ItrInt::ItrItem::cmpKey); + avl_node* avl_ret = avl_insert(&p->curWindow, &item->an, cmp_func); + assert(avl_ret == &item->an); + (void)avl_ret; + cursor = avl_last(&p->curWindow); + } else { + item->flags |= ItrInt::ItrItem::no_more_prev; + cursor = avl_prev(&item->an); + } + } + + p->windowCursor = avl_last(&p->curWindow); + ItrInt::ItrItem* last_valid_item = nullptr; + while (p->windowCursor) { + // Find *LAST* valid item (only for BY_KEY). 
+ ItrInt::ItrItem* item = _get_entry(p->windowCursor, ItrInt::ItrItem, an); + + bool valid = false; + if (p->type == ItrInt::BY_SEQ) { + valid = p->checkValidBySeq(item, cur_seq, true); + if (!valid) p->windowCursor = avl_prev(p->windowCursor); + else break; + + } else if (p->type == ItrInt::BY_KEY) { + valid = p->checkValidByKey(item, cur_key, true); + if (last_valid_item && + p->cmpSizedBuf(item->lastRec.kv.key, + last_valid_item->lastRec.kv.key) < 0) break; + if (valid) last_valid_item = item; + p->windowCursor = avl_prev(p->windowCursor); + } + } + + if (last_valid_item) p->windowCursor = &last_valid_item->an; + + cur_key.free(); + + if (!p->windowCursor) { + // Reached the end. + p->windowCursor = avl_first(&p->curWindow); + return Status::OUT_OF_RANGE; + } + + if (p->type == ItrInt::BY_KEY) { + cur_item = _get_entry(p->windowCursor, ItrInt::ItrItem, an); + if ( !cur_item->lastRec.isIns() ) { + // Deleted key, move further. + continue; + } + } + break; + } + + return Status(); +} + +Status Iterator::next() { + if (!p || !p->db) return Status::NOT_INITIALIZED; + if (!p->windowCursor) return Status::OUT_OF_RANGE; + if (p && p->db) p->db->p->updateOpHistory(); + + // Due to deleted key, it may include multiple steps. 
+ for (;;) { + ItrInt::ItrItem* cur_item = _get_entry(p->windowCursor, ItrInt::ItrItem, an); + uint64_t cur_seq = cur_item->lastRec.seqNum; + SizedBuf cur_key; + cur_key.alloc(cur_item->lastRec.kv.key); + + Status s; + avl_node* cursor = avl_first(&p->curWindow); + while (cursor) { + ItrInt::ItrItem* item = _get_entry(cursor, ItrInt::ItrItem, an); + if (item->flags & ItrInt::ItrItem::no_more_next) { + s = Status::ERROR; + } else { + if ( p->type == ItrInt::BY_SEQ && + item->lastRec.seqNum > cur_seq ) break; + if ( p->type == ItrInt::BY_KEY && + p->cmpSizedBuf(item->lastRec.kv.key, cur_key) > 0 ) break; + + if (item->logItr) s = item->logItr->next(); + if (item->tableItr) s = item->tableItr->next(); + } + + if (s) { + avl_remove(&p->curWindow, &item->an); + item->flags = ItrInt::ItrItem::none; + if (item->logItr) { + s = item->logItr->get(item->lastRec); + } + if (item->tableItr) { + s = item->tableItr->get(item->lastRec); + } + assert(s); + + avl_cmp_func* cmp_func = (p->type == ItrInt::BY_SEQ) + ? (ItrInt::ItrItem::cmpSeq) + : (ItrInt::ItrItem::cmpKey); + avl_node* avl_ret = avl_insert(&p->curWindow, &item->an, cmp_func); + assert(avl_ret == &item->an); + (void)avl_ret; + cursor = avl_first(&p->curWindow); + } else { + item->flags |= ItrInt::ItrItem::no_more_next; + cursor = avl_next(&item->an); + } + } + + p->windowCursor = avl_first(&p->curWindow); + while (p->windowCursor) { + // Find first valid item. + ItrInt::ItrItem* item = _get_entry(p->windowCursor, ItrInt::ItrItem, an); + + bool valid = false; + if (p->type == ItrInt::BY_SEQ) { + valid = p->checkValidBySeq(item, cur_seq); + } else if (p->type == ItrInt::BY_KEY) { + valid = p->checkValidByKey(item, cur_key); + } + + if (!valid) { + p->windowCursor = avl_next(p->windowCursor); + } else { + break; + } + } + cur_key.free(); + + if (!p->windowCursor) { + // Reached the end. 
+ p->moveToLastValid(); + return Status::OUT_OF_RANGE; + } + + if (p->type == ItrInt::BY_KEY) { + cur_item = _get_entry(p->windowCursor, ItrInt::ItrItem, an); + if ( !cur_item->lastRec.isIns() ) { + // Deleted key, move further. + continue; + } + } + break; + } + + return Status(); +} + +Status Iterator::seek(const SizedBuf& key, SeekOption opt) { + return p->seekInternal(key, NOT_INITIALIZED, opt); +} + +Status Iterator::seekSN(const uint64_t seqnum, SeekOption opt) { + SizedBuf dummy_key; + return p->seekInternal(dummy_key, seqnum, opt); +} + +Status Iterator::gotoBegin() { + SizedBuf empty_key; + return p->seekInternal(empty_key, 0, GREATER); +} + +Status Iterator::gotoEnd() { + SizedBuf empty_key; + return p->seekInternal(empty_key, 0, SMALLER, true); +} + +Status Iterator::IteratorInternal::moveToLastValid() { + windowCursor = avl_last(&curWindow); + while (windowCursor) { + // Find *LAST* valid item (only for BY_KEY). + // + // e.g.) + // ... Del K9 (seq 100), Ins K9 (seq 99) + // We should pick up `Del K9`. + ItrInt::ItrItem* item = _get_entry(windowCursor, ItrInt::ItrItem, an); + + if (type == ItrInt::BY_KEY) { + ItrInt::ItrItem* prev_item = nullptr; + avl_node* prev_cursor = avl_prev(windowCursor); + if (prev_cursor) prev_item = _get_entry(prev_cursor, ItrItem, an); + + if (prev_item) { + int cmp = cmpSizedBuf( item->lastRec.kv.key, + prev_item->lastRec.kv.key ); + if (cmp == 0) { + // Same key, should take previous one. + windowCursor = prev_cursor; + continue; + } + } + } + break; +#if 0 + if (item->flags == ItrItem::none) break; + else windowCursor = avl_prev(windowCursor); +#endif + } + return Status(); +} + +Status Iterator::IteratorInternal::seekInternal + ( const SizedBuf& key, + const uint64_t seqnum, + SeekOption opt, + bool goto_end ) +{ + // Empty iterator: do nothing. + if (!windowCursor) return Status(); + + Status s; + + // Remove current items from `curWindow`. 
+ std::vector items; + avl_node* cursor = avl_first(&curWindow); + while (cursor) { + ItrInt::ItrItem* item = _get_entry(cursor, ItrInt::ItrItem, an); + cursor = avl_next(&item->an); + avl_remove(&curWindow, &item->an); + items.push_back(item); + } + + // Seek for all items. + for (auto& entry: items) { + ItrInt::ItrItem*& item = entry; + if (item->logItr) { + if (goto_end) { + s = item->logItr->gotoEnd(); + } else { + if (type == ItrInt::BY_SEQ) { + s = item->logItr->seekSN(seqnum, (LogMgr::Iterator::SeekOption)opt); + } else { + s = item->logItr->seek(key, (LogMgr::Iterator::SeekOption)opt); + } + } + + } else { + if (goto_end) { + s = item->tableItr->gotoEnd(); + } else { + if (type == ItrInt::BY_SEQ) { + s = item->tableItr->seekSN(seqnum, (TableMgr::Iterator::SeekOption)opt); + } else { + s = item->tableItr->seek(key, (TableMgr::Iterator::SeekOption)opt); + } + } + } + + if (s) { + if (item->logItr) { + s = item->logItr->get(item->lastRec); + } + if (item->tableItr) { + s = item->tableItr->get(item->lastRec); + } + assert(s); + + int cmp = 0; + if (goto_end) { + // Goto end: special case. + cmp = -1; + } else { + if (type == ItrInt::BY_SEQ) { + if (item->lastRec.seqNum < seqnum) cmp = -1; + else if (item->lastRec.seqNum > seqnum) cmp = 1; + else cmp = 0; + } else { + cmp = ItrInt::cmpSizedBuf(item->lastRec.kv.key, key); + } + } + + item->flags = ItrInt::ItrItem::none; + if (opt == GREATER && cmp < 0) { + item->flags |= ItrInt::ItrItem::no_more_next; + } else if (opt == SMALLER && cmp > 0) { + item->flags |= ItrInt::ItrItem::no_more_prev; + } + } else { + item->flags = ItrInt::ItrItem::no_more_prev | + ItrInt::ItrItem::no_more_next; + } + + avl_cmp_func* cmp_func = (type == ItrInt::BY_SEQ) + ? 
(ItrInt::ItrItem::cmpSeq) + : (ItrInt::ItrItem::cmpKey); + avl_node* avl_ret = avl_insert(&curWindow, &item->an, cmp_func); + assert(avl_ret == &item->an); + (void)avl_ret; + } + + if (opt == GREATER) { + windowCursor = avl_first(&curWindow); + while (windowCursor) { + // Find first valid item. + ItrInt::ItrItem* item = _get_entry(windowCursor, ItrInt::ItrItem, an); + + if (item->flags == ItrItem::none) break; + else windowCursor = avl_next(windowCursor); + } + } else { // SMALLER + moveToLastValid(); + } + + if (!windowCursor) { + // Reached the end. + if (opt == GREATER) windowCursor = avl_last(&curWindow); + if (opt == SMALLER) windowCursor = avl_first(&curWindow); + } + + if (type == BY_KEY) { + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + while ( !item->lastRec.isIns() ) { + // Deleted key, move the cursor. + if (opt == GREATER) s = parent->next(); + if (opt == SMALLER) s = parent->prev(); + + // NOTE: no key exists, should we return OK? + // if (!s) return s; + if (!s) break;; + item = _get_entry(windowCursor, ItrItem, an); + } + } + + return Status(); +} + + +int Iterator::IteratorInternal::cmpSizedBuf(const SizedBuf& l, const SizedBuf& r) { + CMP_NULL_CHK(l.data, r.data); + if (db->p->dbConfig.cmpFunc) { + // Custom cmp mode. + CustomCmpFunc func = db->p->dbConfig.cmpFunc; + void* param = db->p->dbConfig.cmpFuncParam; + return func(l.data, l.size, r.data, r.size, param); + } + return SizedBuf::cmp(l, r); +} + +bool Iterator::IteratorInternal::checkValidBySeq(ItrInt::ItrItem* item, + const uint64_t cur_seq, + const bool is_prev) +{ + if ( ( !is_prev && (item->flags & ItrInt::ItrItem::no_more_next) ) || + ( is_prev && (item->flags & ItrInt::ItrItem::no_more_prev) ) ) { + return false; + } else if (item->lastRec.seqNum == cur_seq) { + // Duplicate item, skip. 
+ return false; + } + return true; +} + +bool Iterator::IteratorInternal::checkValidByKey(ItrInt::ItrItem* item, + const SizedBuf& cur_key, + const bool is_prev) +{ + if ( ( !is_prev && (item->flags & ItrInt::ItrItem::no_more_next) ) || + ( is_prev && (item->flags & ItrInt::ItrItem::no_more_prev) ) ) { + return false; + } else if (cmpSizedBuf(item->lastRec.kv.key, cur_key) == 0) { + // Duplicate item, skip. + return false; + } + return true; +} + +Status Iterator::close() { + if (p) { + avl_node* cursor = avl_first(&p->curWindow); + while (cursor) { + ItrInt::ItrItem* item = _get_entry(cursor, ItrInt::ItrItem, an); + cursor = avl_next(&item->an); + avl_remove(&p->curWindow, &item->an); + } + + for (auto& entry: p->itrs) { + ItrInt::ItrItem* item = entry; + if (item->logItr) { + item->logItr->close(); + delete item->logItr; + } + if (item->tableItr) { + item->tableItr->close(); + delete item->tableItr; + } + delete item; + } + + p->itrs.clear(); + p->db = nullptr; + p->windowCursor = nullptr; + } + return Status(); +} + +}; // namespace jungle + diff --git a/src/jungle.cc b/src/jungle.cc new file mode 100644 index 0000000..523f080 --- /dev/null +++ b/src/jungle.cc @@ -0,0 +1,818 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "db_internal.h" +#include "db_mgr.h" +#include "fileops_directio.h" +#include "fileops_posix.h" +#include "flusher.h" +#include "internal_helper.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include + +#include + +namespace jungle { + +DB::DB() : p(new DBInternal()), sn(nullptr) {} + +DB::DB(DB* _parent, uint64_t last_flush, uint64_t checkpoint) + : p(_parent->p) + , sn( new SnapInternal(last_flush, checkpoint) ) + {} + +DB::~DB() { + if (!sn) { + delete p; + } else { + delete sn; + } +} + +Status DB::open(DB** ptr_out, + const std::string& path, + const DBConfig& db_config) +{ + if (!ptr_out || path.empty() || !db_config.isValid()) { + return Status::INVALID_PARAMETERS; + } + + Status s; + + DBMgr* db_mgr = DBMgr::get(); + std::string empty_kvs_name; + DB* db = db_mgr->openExisting(path, empty_kvs_name); + if (db) { + _log_info(db->p->myLog, "Open existing DB %s %p", path.c_str(), db); + *ptr_out = db; + return Status(); + } + + try { + db = new DB(); + db->p->path = path; + db->p->fOps = new FileOpsPosix(); + db->p->dbConfig = db_config; + db->p->adjustConfig(); + + bool previous_exists = false; + if ( db->p->fOps->exist(db->p->path) && + db->p->fOps->exist(db->p->path + "/db_manifest") ) { + previous_exists = true; + } + + if (!previous_exists) { + if (db->p->dbConfig.readOnly) { + // Read-only mode: should fail. + throw Status(Status::FILE_NOT_EXIST); + } + // Create the directory. + db->p->fOps->mkdir(db->p->path.c_str()); + } + + // Start logger if enabled. 
+ if ( db->p->dbConfig.allowLogging && + !db->p->myLog ) { + std::string logfile = db->p->path + "/system_logs.log"; + db->p->myLog = new SimpleLogger(logfile, 1024, 32*1024*1024, 4); + db->p->myLog->setLogLevel(4); + db->p->myLog->setDispLevel(-1); + db->p->myLog->start(); + } + db->p->fDirectOps = new FileOpsDirectIO(db->p->myLog); + _log_info(db->p->myLog, "Open new DB handle %s %p", path.c_str(), db); + _log_info(db->p->myLog, "cache size %zu", db_mgr->getGlobalConfig()->fdbCacheSize); + + if (previous_exists) { + // DB already exists, load it. + // Load DB manifest. + db->p->mani = new DBManifest(db->p->fOps); + db->p->mani->setLogger(db->p->myLog); + std::string m_filename = db->p->path + "/db_manifest"; + db->p->mani->load(db->p->path, m_filename); + } else { + // Create DB manifest. + db->p->mani = new DBManifest(db->p->fOps); + db->p->mani->setLogger(db->p->myLog); + std::string m_filename = db->p->path + "/db_manifest"; + db->p->mani->create(db->p->path, m_filename); + } + + // Main DB handle's ID is 0. + db->p->kvsID = 0; + + // Init log manager. + // It will manage log-manifest and log files. + LogMgrOptions log_mgr_opt; + log_mgr_opt.fOps = db->p->fOps; + log_mgr_opt.fDirectOps = db->p->fDirectOps; + log_mgr_opt.path = db->p->path; + log_mgr_opt.prefixNum = db->p->kvsID; + log_mgr_opt.dbConfig = &db->p->dbConfig; + db->p->logMgr = new LogMgr(db); + db->p->logMgr->setLogger(db->p->myLog); + TC(db->p->logMgr->init(log_mgr_opt)); + + // Init table manager. + // It will manage table-manifest and table files. + TableMgrOptions table_mgr_opt; + table_mgr_opt.fOps = db->p->fOps; + table_mgr_opt.path = db->p->path; + table_mgr_opt.prefixNum = db->p->kvsID; + table_mgr_opt.dbConfig = &db->p->dbConfig; + db->p->tableMgr = new TableMgr(db); + db->p->tableMgr->setLogger(db->p->myLog); + TC(db->p->tableMgr->init(table_mgr_opt)); + + // In case of previous crash, + // sync table's last seqnum if log is lagging behind. 
+ db->p->logMgr->syncSeqnum(db->p->tableMgr); + + s = db_mgr->assignNew(db); + if (!s) { + // Other thread already creates the handle. + _log_debug(db->p->myLog, "Duplicate DB handle for %s %p", path.c_str(), db); + db->p->destroy(); + delete db; + db = db_mgr->openExisting(path, empty_kvs_name); + } + + *ptr_out = db; + return Status(); + + } catch (Status s) { + if (db && db->p) db->p->destroy(); + delete db; + return s; + } +} + +bool DB::isLogSectionMode(const std::string& path) { + if (!FileMgr::exist(path)) { + // Path doesn't exist. + return false; + } + + // TODO: Other non-default DB as well? + std::string t_mani_file = path + "/table0000_manifest"; + if (!FileMgr::exist(t_mani_file)) { + // Table manifest file doesn't exist. + return false; + } + + std::unique_ptr f_ops = + std::unique_ptr( new FileOpsPosix() ); + + Status s; + FileHandle* m_file = nullptr; + s = f_ops->open(&m_file, t_mani_file); + if (!s) return false; + + RwSerializer ss(f_ops.get(), m_file); + + // First 8 bytes should be 0xffff... + uint64_t last_t_num = ss.getU64(); + + // Next 8 bytes should be 1. + uint32_t num_levels = ss.getU32(); + + f_ops->close(m_file); + delete m_file; + + if ( last_t_num == std::numeric_limits::max() && + num_levels == 1 ) return true; + + return false; +} + +Status DB::init(const GlobalConfig& global_config) { + DBMgr* mgr = DBMgr::init(global_config); + if (!mgr) { + return Status::ERROR; + } + return Status(); +} + +Status DB::shutdown() { + DBMgr* mgr = DBMgr::getWithoutInit(); + if (!mgr) return Status::ALREADY_SHUTDOWN; + + bool shutdown_logger = mgr->getGlobalConfig()->shutdownLogger; + mgr->destroy(); + + if (shutdown_logger) { + SimpleLogger::shutdown(); + } + + return Status(); +} + +Status DB::close(DB* db) { + if (db->sn) { + _log_trace(db->p->myLog, "close snapshot %p", db); + // This is a snapshot handle. 
+ db->p->logMgr->closeSnapshot(db); + db->p->tableMgr->closeSnapshot(db); + delete db; + return Status(); + } + + _log_info(db->p->myLog, "close db %p", db); + if (db->p->dbGroup) { + // This is a default handle of parent DBGroup handle. + // Do not close it. It will be closed along with DBGroup handle. + _log_info(db->p->myLog, + "default handle of group %p, defer to close it", + db->p->dbGroup); + return Status(); + } + + DBMgr* mgr = DBMgr::getWithoutInit(); + if (!mgr) return Status::ALREADY_SHUTDOWN; + return mgr->close(db); +} + +Status DB::openSnapshot(DB** snap_out, + const uint64_t checkpoint) +{ + Status s; + EP( p->checkHandleValidity() ); + + uint64_t chk_local = checkpoint; + if (checkpoint) { + // If checkpoint is given, find exact match. + std::list chk_nums; + EP(getCheckpoints(chk_nums)); + + bool found = false; + for (auto& entry: chk_nums) { + if (entry == checkpoint) { + found = true; + break; + } + } + if (!found) return Status::INVALID_CHECKPOINT; + } else { + // If 0, take the latest snapshot. + // NOTE: Should tolerate error for empty DB. + p->logMgr->getMaxSeqNum(chk_local); + } + uint64_t last_flush_seq = 0; + p->logMgr->getLastFlushedSeqNum(last_flush_seq); + + DB* snap = new DB(this, last_flush_seq, chk_local); + + p->logMgr->openSnapshot(snap, chk_local, snap->sn->logList); + // Maybe need same parameter to logList above. + p->tableMgr->openSnapshot(snap, chk_local, snap->sn->tableList); + *snap_out = snap; + + return Status(); +} + +Status DB::rollback(uint64_t seqnum_upto) +{ + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) ); + + // NOTE: Only for log-only mode for now. 
+ if (!p->dbConfig.logSectionOnly) return Status::INVALID_MODE; + + return p->logMgr->rollback(seqnum_upto); +} + +Status DB::set(const KV& kv) { + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) ); + return setSN(NOT_INITIALIZED, kv); +} + +Status DB::setSN(const uint64_t seq_num, const KV& kv) { + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) ); + + Record rec; + rec.kv = kv; + rec.seqNum = seq_num; + s = p->logMgr->setSN(rec); + return s; +} + +Status DB::setRecord(const Record& rec) { + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) ); + return p->logMgr->setSN(rec); +} + +Status DB::setRecordByKey(const Record& rec) { + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) ); + Record rec_local = rec; + rec_local.seqNum = NOT_INITIALIZED; + return p->logMgr->setSN(rec_local); +} + +Status DB::setRecordByKeyMulti(std::list& batch, + bool last_batch) +{ + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) ); + + if (!p->dbConfig.bulkLoading) return Status::INVALID_MODE; + return p->logMgr->setByBulkLoader(batch, p->tableMgr, last_batch); +} + +Status DB::getMaxSeqNum(uint64_t& seq_num_out) { + Status s; + EP( p->checkHandleValidity() ); + return p->logMgr->getMaxSeqNum(seq_num_out); +} + +Status DB::getMinSeqNum(uint64_t& seq_num_out) { + Status s; + EP( p->checkHandleValidity() ); + return p->logMgr->getMinSeqNum(seq_num_out); +} + +Status DB::getLastFlushedSeqNum(uint64_t& seq_num_out) { + Status s; + EP( p->checkHandleValidity() ); + return p->logMgr->getLastFlushedSeqNum(seq_num_out); +} + +Status DB::getLastSyncedSeqNum(uint64_t& seq_num_out) { + Status s; + EP( p->checkHandleValidity() ); + return p->logMgr->getLastSyncedSeqNum(seq_num_out); +} + +Status DB::getCheckpoints(std::list& chk_out) { + Status s; + EP( p->checkHandleValidity() ); + + std::list chk_local; + EP(p->tableMgr->getAvailCheckpoints(chk_local)); + EP(p->logMgr->getAvailCheckpoints(chk_local)); + + // Asc order 
sort, remove duplicates. + std::set chk_sorted; + for (auto& entry: chk_local) chk_sorted.insert(entry); + for (auto& entry: chk_sorted) chk_out.push_back(entry); + + return Status(); +} + +Status DB::sync(bool call_fsync) { + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_FLUSH) ); + s = p->logMgr->sync(call_fsync); + return s; +} + +Status DB::syncNoWait(bool call_fsync) { + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_FLUSH) ); + s = p->logMgr->syncNoWait(call_fsync); + return s; +} + +Status DB::flushLogs(const FlushOptions& options, const uint64_t seq_num) { + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_FLUSH) ); + + FlushOptions local_options = options; + if (p->dbConfig.logSectionOnly) local_options.purgeOnly = true; + + _log_info(p->myLog, "Flush logs, upto %s (purgeOnly = %s).", + _seq_str(seq_num).c_str(), (local_options.purgeOnly)?"true":"false"); + + s = p->logMgr->flush(local_options, seq_num, p->tableMgr); + return s; +} + +Status DB::flushLogsAsync(const FlushOptions& options, + UserHandler handler, + void* ctx, + const uint64_t seq_num) +{ + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_FLUSH) ); + + DBMgr* db_mgr = DBMgr::getWithoutInit(); + if (!db_mgr) return Status::NOT_INITIALIZED; + + FlushOptions local_options = options; + if (p->dbConfig.logSectionOnly) local_options.purgeOnly = true; + + // Selective logging based on timer, to avoid verbose messages. + int num_suppressed = 0; + SimpleLogger::Levels lv = p->vlAsyncFlush.checkPrint(num_suppressed) + ? SimpleLogger::INFO + : SimpleLogger::DEBUG; + num_suppressed = (p->myLog && p->myLog->getLogLevel() >= SimpleLogger::DEBUG) + ? 0 : num_suppressed; + + _log_( lv, p->myLog, + "Request async log flushing, upto %s " + "(purgeOnly = %s, syncOnly = %s, callFsync = %s, delay = %zu), " + "%d messages suppressed", + _seq_str(seq_num).c_str(), + (local_options.purgeOnly) ? "true" : "false", + (local_options.syncOnly) ? 
"true" : "false", + (local_options.callFsync) ? "true" : "false", + local_options.execDelayUs, + num_suppressed ); + + FlusherQueueElem* elem = + new FlusherQueueElem(this, local_options, seq_num, handler, ctx); + db_mgr->flusherQueue()->push(elem); + + if (options.execDelayUs) { + // Delay is given. + std::lock_guard l(p->asyncFlushJobLock); + if ( !p->asyncFlushJob || + p->asyncFlushJob->isDone() ) { + // Schedule a new timer. + p->asyncFlushJob = db_mgr->getTpMgr()->addTask( + [db_mgr, lv, this](const simple_thread_pool::TaskResult& ret) { + if (!ret.ok()) return; + _log_(lv, p->myLog, "delayed flushing wakes up"); + db_mgr->workerMgr()->invokeWorker("flusher"); + }, + local_options.execDelayUs ); + _log_(lv, p->myLog, "scheduled delayed flushing %p, %zu us", + p->asyncFlushJob.get(), + local_options.execDelayUs ); + } + } else { + // Immediately invoke. + _log_(lv, p->myLog, "invoke flush worker"); + db_mgr->workerMgr()->invokeWorker("flusher"); + } + + return Status(); +} + +Status DB::checkpoint(uint64_t& seq_num_out, bool call_fsync) { + Status s; + s = p->logMgr->checkpoint(seq_num_out, call_fsync); + if (s) { + _log_info(p->myLog, "Added new checkpoint for %ld.", seq_num_out); + } else { + _log_err(p->myLog, "checkpoint returned %d.", s); + } + return s; +} + +Status DB::compactL0(const CompactOptions& options, + uint32_t hash_num) { + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_COMPACT) ); + s = p->tableMgr->compactL0(options, hash_num); + return s; +} + +Status DB::compactLevel(const CompactOptions& options, + size_t level) +{ + Status s; + EP( p->checkHandleValidity(DBInternal::OPTYPE_COMPACT) ); + if (p->dbConfig.nextLevelExtension) { + if (level == 0) { + // In level extension mode, level-0 is based on + // hash partition. We should call `compactL0()`. 
// (Tail of DB::compactLevel — the head of this function is outside this view.)
            return Status::INVALID_LEVEL;
        }
        s = p->tableMgr->compactLevelItr(options, nullptr, level);
    }
    return s;
}

// Compact the given table level in place (rewrite tables at the same level,
// without moving data down to the next level).
// Level 0 is the log section and cannot be compacted this way.
// @return OK on success, INVALID_LEVEL for level 0, or an error propagated
//         from the handle-validity check or the table manager.
Status DB::compactInplace(const CompactOptions& options,
                          size_t level)
{
    Status s;
    // Reject if the handle is closing, or rollback is in progress.
    EP( p->checkHandleValidity(DBInternal::OPTYPE_COMPACT) );
    if (level == 0) return Status::INVALID_LEVEL;
    s = p->tableMgr->compactInPlace(options, nullptr, level);
    return s;
}

// Split the given table level into more tables. Level 0 cannot be split.
Status DB::splitLevel(const CompactOptions& options,
                      size_t level)
{
    Status s;
    EP( p->checkHandleValidity(DBInternal::OPTYPE_COMPACT) );
    if (level == 0) return Status::INVALID_LEVEL;
    s = p->tableMgr->splitLevel(options, nullptr, level);
    return s;
}

// Merge tables in the given level. Level 0 cannot be merged.
Status DB::mergeLevel(const CompactOptions& options,
                      size_t level)
{
    Status s;
    EP( p->checkHandleValidity(DBInternal::OPTYPE_COMPACT) );
    if (level == 0) return Status::INVALID_LEVEL;
    s = p->tableMgr->mergeLevel(options, nullptr, level);
    return s;
}


// NOTE:
//   MemTable --> LogFile --> LogMgr ----> DB --(memcpy)--+--> User
//   TableFile --(memcpy)---> TableMgr --> DB ------------+

// Point lookup: search the log section first, then the table section.
// On success `value_out` owns a freshly allocated copy of the value;
// the caller is responsible for freeing it.
// NOTE(review): this chunk came from a patch whose template argument lists
// were stripped by extraction (e.g. `std::list*` below); tokens are kept as-is.
Status DB::get(const SizedBuf& key, SizedBuf& value_out) {
    Status s;
    EP( p->checkHandleValidity() );

    Record rec_local;
    // If this handle is a snapshot (`sn` set), restrict the search to the
    // snapshot's checkpoint number and captured log-file list.
    uint64_t chknum = (sn)?(sn->chkNum):(NOT_INITIALIZED);
    std::list* l_list = (sn)?(sn->logList):(nullptr);
    s = p->logMgr->get(chknum, l_list, key, rec_local);
    if (s) {
        if (!rec_local.isIns()) {
            // Removed key, should return without searching tables.
            return Status::KEY_NOT_FOUND;
        }
        value_out.alloc(rec_local.kv.value.size, rec_local.kv.value.data);
        return s;
    }

    // Not exist in log, but maybe they have been purged.
    // Search in table.
    Record rec;
    rec.kv.key = key;
    DB* snap_handle = (this->sn)?(this):(nullptr);
    s = p->tableMgr->get(snap_handle, rec);
    if (s) {
        if (!rec.isIns()) {
            // Removed key (tombstone in table section).
            return Status::KEY_NOT_FOUND;
        }
        // Transfer ownership of the value buffer to the caller.
        rec.kv.value.moveTo(value_out);
    }
    return s;
}

// Lookup by sequence number (log section only).
// Returns NOT_KV_PAIR if the record at `seq_num` is not an insertion.
Status DB::getSN(const uint64_t seq_num, KV& kv_out) {
    Status s;
    EP( p->checkHandleValidity() );

    Record rec;
    s = p->logMgr->getSN(seq_num, rec);
    if (!s) return s;
    if (!rec.isIns()) {
        return Status::NOT_KV_PAIR;
    }
    rec.kv.copyTo(kv_out);
    return s;
}

// Fetch the whole record (any type) at the given sequence number.
// `rec_out` receives a deep copy; caller frees it.
Status DB::getRecord(const uint64_t seq_num, Record& rec_out) {
    Status s;
    EP( p->checkHandleValidity() );

    Record rec;
    s = p->logMgr->getSN(seq_num, rec);
    if (!s) return s;
    rec.copyTo(rec_out);
    return s;
}

// Point lookup returning the full record.
// If `meta_only` is true, the value is dropped and even deletion markers
// are returned (useful for inspecting tombstones / custom metadata).
Status DB::getRecordByKey(const SizedBuf& key,
                          Record& rec_out,
                          bool meta_only)
{
    Status s;
    EP( p->checkHandleValidity() );

    Record rec_local;
    uint64_t chknum = (sn)?(sn->chkNum):(NOT_INITIALIZED);
    std::list* l_list = (sn)?(sn->logList):(nullptr);
    s = p->logMgr->get(chknum, l_list, key, rec_local);
    if (s) {
        if (!meta_only && !rec_local.isIns()) {
            // NOT meta only mode + removed key:
            // should return without searching tables.
            return Status::KEY_NOT_FOUND;
        }
        if (meta_only) {
            rec_local.kv.value.clear();
        }
        rec_local.copyTo(rec_out);
        return s;
    }

    // Not exist in log, but maybe they have been purged.
    // Search in table.
    key.copyTo(rec_out.kv.key);
    DB* snap_handle = (this->sn)?(this):(nullptr);
    s = p->tableMgr->get(snap_handle, rec_out, meta_only);
    if (s) {
        if (!meta_only && !rec_out.isIns()) {
            // Removed key, return false if not `meta_only` mode.
            rec_out.free();
            return Status::KEY_NOT_FOUND;
        }
    } else {
        // Lookup failed: release whatever was allocated into rec_out
        // (at least the key copy above).
        rec_out.free();
    }
    return s;
}

// Delete `key` with an auto-assigned sequence number.
// (Thin wrapper: delSN repeats the handle-validity check.)
Status DB::del(const SizedBuf& key) {
    Status s;
    EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) );
    return delSN(NOT_INITIALIZED, key);
}

// Delete `key` with an explicit sequence number.
// Deletion is logical: a deletion-marker record is appended to the log.
Status DB::delSN(const uint64_t seq_num, const SizedBuf& key) {
    Status s;
    EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) );

    // Add a deletion marker for the given key.
    Record rec;
    rec.kv.key = key;
    rec.seqNum = seq_num;
    rec.type = Record::Type::DELETION;
    s = p->logMgr->setSN(rec);
    return s;
}

// Delete using a caller-supplied record (keeps its key/meta/seq number);
// the record type is forced to DELETION.
Status DB::delRecord(const Record& rec) {
    Status s;
    EP( p->checkHandleValidity(DBInternal::OPTYPE_WRITE) );

    // Shallow copy is intentional: buffers still belong to the caller.
    Record rec_local = rec;
    rec_local.type = Record::DELETION;
    return p->logMgr->setSN(rec_local);
}

// Collect DB-wide statistics: approximate number of KVs (tables + log),
// working set size, and cache usage.
Status DB::getStats(DBStats& stats_out) {
    Status s;
    EP( p->checkHandleValidity() );

    DBMgr* mgr = DBMgr::getWithoutInit();
    if (!mgr) return Status::NOT_INITIALIZED;

    stats_out.cacheSizeByte = mgr->getGlobalConfig()->fdbCacheSize;

    // Number of entries in log.
    uint64_t num_kvs_log = 0;
    if (p) {
        // NOTE(review): this inner `Status s` shadows the outer one —
        // harmless here, but worth cleaning up.
        Status s;
        uint64_t min_seq = 0, max_seq = 0;
        s = p->logMgr->getAvailSeqRange(min_seq, max_seq);
        if ( s &&
             valid_number(min_seq) &&
             valid_number(max_seq) &&
             max_seq >= min_seq ) {
            if (min_seq) {
                num_kvs_log = max_seq - min_seq + 1;
            } else {
                // Flush never happened.
                num_kvs_log = max_seq;
            }
        }
    }

    TableStats t_stats;
    if (p && p->tableMgr) p->tableMgr->getStats(t_stats);
    stats_out.numKvs = t_stats.numKvs + num_kvs_log;
    stats_out.workingSetSizeByte = t_stats.workingSetSizeByte;
    stats_out.cacheUsedByte = t_stats.cacheUsedByte;

    return Status();
}

// Set / get the verbosity of this handle's logger.
void DB::setLogLevel(int new_level) {
    p->myLog->setLogLevel(new_level);
}

int DB::getLogLevel() const {
    return p->myLog->getLogLevel();
}

// Install process-wide debug parameters, effective for
// `effective_time_sec` seconds. No-op if the global manager is not up.
void DB::setDebugParams(const DebugParams& to,
                        size_t effective_time_sec)
{
    DBMgr* mgr = DBMgr::getWithoutInit();
    if (!mgr) return;
    mgr->setDebugParams(to, effective_time_sec);
}

// Read the current process-wide debug parameters
// (defaults if the global manager is not up).
DebugParams DB::getDebugParams() {
    static DebugParams default_debug_params;
    DBMgr* mgr = DBMgr::getWithoutInit();
    if (!mgr) return default_debug_params;
    return mgr->getDebugParams();
}

// === Internal ================================================================

DB::DBInternal::DBInternal()
    : dbGroup(nullptr)
    , wrapper(nullptr)
    , fOps(nullptr)
    , fDirectOps(nullptr)
    , mani(nullptr)
    , kvsID(0)
    , logMgr(nullptr)
    , tableMgr(nullptr)
    , myLog(nullptr)
    , vlAsyncFlush(VERBOSE_LOG_SUPPRESS_MS)
    , asyncFlushJob(nullptr)
    {}

DB::DBInternal::~DBInternal() {}

// Tear down this handle: stop background work, cancel pending async flush,
// persist the manifest (default DB only, when writable), then close and free
// managers and file-op objects in dependency order.
void DB::DBInternal::destroy() {
    flags.closing = true;
    if (tableMgr) tableMgr->disallowCompaction();
    waitForBgTasks();

    {   // Cancel async flush if exists.
        std::lock_guard l(asyncFlushJobLock);
        if (asyncFlushJob) {
            bool ret = asyncFlushJob->cancel();
            _log_info(myLog, "cancel delayed async flush job %p: %d",
                      asyncFlushJob.get(), ret);
        }
    }

    if (!kvsID) { // Only default DB.
        if (!dbConfig.readOnly) {
            mani->store();
            mani->sync();
        }
        DELETE(mani);
    }

    if (logMgr) logMgr->close();
    if (tableMgr) tableMgr->close();
    DELETE(logMgr);
    DELETE(tableMgr);

    // WARNING:
    //   fOps MUST be freed after both log & table manager, as
    //   the destructor of both managers use file operations for
    //   closing their files.
    DELETE(fOps);
    DELETE(fDirectOps);
    if (!kvsID) { // Only default DB.
        DELETE(myLog);
    }
}

void DB::DBInternal::adjustConfig() {
}

// Bounded wait for background tasks to drain:
// polls every 100 ms, up to MAX_CLOSE_RETRY ticks, then gives up and
// logs the remaining count (does not force-stop stragglers).
void DB::DBInternal::waitForBgTasks() {
    _log_info(myLog, "%zu background tasks are in progress",
              flags.onGoingBgTasks.load());
    uint64_t ticks = 0;
    while (flags.onGoingBgTasks && ticks < DB::DBInternal::MAX_CLOSE_RETRY) {
        ticks++;
        Timer::sleepMs(100);
    }
    _log_info(myLog, "%zu background tasks are in progress, %zu ticks",
              flags.onGoingBgTasks.load(), ticks);
}

// Record one user operation in the global op history (best effort).
void DB::DBInternal::updateOpHistory() {
    DBMgr* mgr = DBMgr::getWithoutInit();
    if (!mgr) return;
    mgr->updateOpHistory();
}

// Gate every public API call: reject once close has started, and reject
// mutating ops while rollback is in progress (reads are still allowed).
Status DB::DBInternal::checkHandleValidity(OpType op_type) {
    if (flags.closing) return Status::HANDLE_IS_BEING_CLOSED;
    if ( op_type != DBInternal::OPTYPE_READ &&
         flags.rollbackInProgress ) {
        _log_warn(myLog, "attempt to mutate db while rollback is in progress, "
                  "op type %d",
                  op_type);
        return Status::ROLLBACK_IN_PROGRESS;
    }
    return Status::OK;
}

} // namespace jungle
+**************************************************************************/ + +#pragma once + +#include "ashared_ptr.h" +#include "histogram.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct LatencyCollectorDumpOptions { + enum SortBy { + NAME, + TOTAL_TIME, + NUM_CALLS, + AVG_LATENCY + }; + + enum ViewType { + TREE, + FLAT + }; + + LatencyCollectorDumpOptions() + : sort_by(SortBy::NAME) + , view_type(ViewType::TREE) + {} + + SortBy sort_by; + ViewType view_type; +}; + +class LatencyItem; +class MapWrapper; +class LatencyDump { +public: + virtual std::string dump(MapWrapper* map_w, + const LatencyCollectorDumpOptions& opt) = 0; + virtual std::string dumpTree(MapWrapper* map_w, + const LatencyCollectorDumpOptions& opt) = 0; + + // To make child class be able to access internal map. + std::unordered_map& getMap(MapWrapper* map_w); +}; + +class LatencyItem { +public: + LatencyItem() {} + LatencyItem(const std::string& _name) : statName(_name) {} + LatencyItem(const LatencyItem& src) + : statName(src.statName) + , hist(src.hist) {} + + // this = src + LatencyItem& operator=(const LatencyItem& src) { + statName = src.statName; + hist = src.hist; + return *this; + } + + // this += rhs + LatencyItem& operator+=(const LatencyItem& rhs) { + hist += rhs.hist; + return *this; + } + + // returning lhs + rhs + friend LatencyItem operator+(LatencyItem lhs, + const LatencyItem& rhs) + { + lhs.hist += rhs.hist; + return lhs; + } + + std::string getName() const { + return statName; + } + + void addLatency(uint64_t latency) { hist.add(latency); } + uint64_t getAvgLatency() const { return hist.getAverage(); } + uint64_t getTotalTime() const { return hist.getSum(); } + uint64_t getNumCalls() const { return hist.getTotal(); } + uint64_t getMaxLatency() const { return hist.getMax(); } + uint64_t getMinLatency() { return hist.estimate(1); } + uint64_t 
getPercentile(double percentile) { return hist.estimate(percentile); } + + size_t getNumStacks() const { + size_t pos = 0; + size_t str_size = statName.size(); + size_t ret = 0; + while (pos < str_size) { + pos = statName.find(" ## ", pos); + if (pos == std::string::npos) break; + pos += 4; + ret++; + } + return ret; + } + + std::string getActualFunction() const { + size_t level = getNumStacks(); + if (!level) { + return statName; + } + + size_t pos = statName.rfind(" ## "); + return statName.substr(pos + 4); + } + + std::string getStatName() const { return statName; } + + std::map dumpHistogram() const { + std::map ret; + for (auto& entry: hist) { + HistItr& itr = entry; + uint64_t cnt = itr.getCount(); + if (cnt) { + ret.insert( std::make_pair(itr.getUpperBound(), cnt) ); + } + } + return ret; + } + +private: + std::string statName; + Histogram hist; +}; + +class LatencyCollector; +class MapWrapper { + friend class LatencyCollector; + friend class LatencyDump; +public: + MapWrapper() {} + MapWrapper(const MapWrapper &src) { + copyFrom(src); + } + + ~MapWrapper() {} + + size_t getSize() const { + size_t ret = 0; + for (auto& entry: map) { + if (entry.second->getNumCalls()) { + ret++; + } + } + return ret; + } + + void copyFrom(const MapWrapper &src) { + // Make a clone (but the map will point to same LatencyItems) + map = src.map; + } + + LatencyItem* addItem(const std::string& bin_name) { + LatencyItem* item = new LatencyItem(bin_name); + map.insert( std::make_pair(bin_name, item) ); + return item; + } + + void delItem(const std::string& bin_name) { + LatencyItem* item = nullptr; + auto entry = map.find(bin_name); + if (entry != map.end()) { + item = entry->second; + map.erase(entry); + delete item; + } + } + + LatencyItem* get(const std::string& bin_name) { + LatencyItem* item = nullptr; + auto entry = map.find(bin_name); + if (entry != map.end()) { + item = entry->second; + } + return item; + } + + std::string dump(LatencyDump* dump_inst, + const 
LatencyCollectorDumpOptions& opt) { + if (dump_inst) return dump_inst->dump(this, opt); + return "null dump implementation"; + } + + std::string dumpTree(LatencyDump* dump_inst, + const LatencyCollectorDumpOptions& opt) { + if (dump_inst) return dump_inst->dumpTree(this, opt); + return "null dump implementation"; + } + + void freeAllItems() { + for (auto& entry : map) { + delete entry.second; + } + } + +private: + std::unordered_map map; +}; + +inline std::unordered_map& + LatencyDump::getMap(MapWrapper* map_w) +{ + return map_w->map; +} + +using MapWrapperSP = ashared_ptr; +//using MapWrapperSP = std::shared_ptr; + +class LatencyCollector { + friend class LatencyDump; + +public: + LatencyCollector() { + latestMap = MapWrapperSP(new MapWrapper()); + } + + ~LatencyCollector() { + latestMap->freeAllItems(); + } + + size_t getNumItems() const { + return latestMap->getSize(); + } + + void addStatName(const std::string& lat_name) { + MapWrapperSP cur_map = latestMap; + if (!cur_map->get(lat_name)) { + cur_map->addItem(lat_name); + } // Otherwise: already exists. + } + + void addLatency(const std::string& lat_name, uint64_t lat_value) { + MapWrapperSP cur_map = nullptr; + + size_t ticks_allowed = MAX_ADD_NEW_ITEM_RETRIES; + do { + cur_map = latestMap; + LatencyItem *item = cur_map->get(lat_name); + if (item) { + // Found existing latency. + item->addLatency(lat_value); + return; + } + + // Not found, + // 1) Create a new map containing new stat in an MVCC manner, and + // 2) Replace 'latestMap' pointer atomically. + + // Note: + // Below insertion process happens only when a new stat item + // is added. Generally the number of stats is not pretty big (<100), + // and adding new stats will be finished at the very early stage. + // Once all stats are populated in the map, below codes will never + // be called, and adding new latency will be done without blocking + // anything. + + // Copy from the current map. 
+ MapWrapper* new_map_raw = new MapWrapper(); + new_map_raw->copyFrom(*cur_map); + MapWrapperSP new_map = MapWrapperSP(new_map_raw); + + // Add a new item. + item = new_map->addItem(lat_name); + item->addLatency(lat_value); + + // Atomic CAS, from current map to new map + MapWrapperSP expected = cur_map; + if (latestMap.compare_exchange(expected, new_map)) { + // Succeeded. + return; + } + + // Failed, other thread updated the map at the same time. + // Delete newly added item. + new_map_raw->delItem(lat_name); + // Retry. + } while (ticks_allowed--); + + // Update failed, ignore the given latency at this time. + } + + LatencyItem getAggrItem(const std::string& lat_name) { + LatencyItem ret; + if (lat_name.empty()) return ret; + + MapWrapperSP cur_map_p = latestMap; + MapWrapper* cur_map = cur_map_p.get(); + + for (auto& entry: cur_map->map) { + LatencyItem *item = entry.second; + std::string actual_name = item->getActualFunction(); + + if (actual_name != lat_name) continue; + + if (ret.getName().empty()) { + // Initialize. + ret = *item; + } else { + // Already exists. + ret += *item; + } + } + + return ret; + } + + uint64_t getAvgLatency(const std::string& lat_name) { + MapWrapperSP cur_map = latestMap; + LatencyItem *item = cur_map->get(lat_name); + return (item)? item->getAvgLatency() : 0; + } + + uint64_t getMinLatency(const std::string& lat_name) { + MapWrapperSP cur_map = latestMap; + LatencyItem *item = cur_map->get(lat_name); + return (item && item->getNumCalls()) ? item->getMinLatency() : 0; + } + + uint64_t getMaxLatency(const std::string& lat_name) { + MapWrapperSP cur_map = latestMap; + LatencyItem *item = cur_map->get(lat_name); + return (item) ? item->getMaxLatency() : 0; + } + + uint64_t getTotalTime(const std::string& lat_name) { + MapWrapperSP cur_map = latestMap; + LatencyItem *item = cur_map->get(lat_name); + return (item) ? 
item->getTotalTime() : 0; + } + + uint64_t getNumCalls(const std::string& lat_name) { + MapWrapperSP cur_map = latestMap; + LatencyItem *item = cur_map->get(lat_name); + return (item) ? item->getNumCalls() : 0; + } + + uint64_t getPercentile(const std::string& lat_name, double percentile) { + MapWrapperSP cur_map = latestMap; + LatencyItem *item = cur_map->get(lat_name); + return (item) ? item->getPercentile(percentile) : 0; + } + + std::string dump( LatencyDump* dump_inst, + const LatencyCollectorDumpOptions& opt + = LatencyCollectorDumpOptions() ) + { + MapWrapperSP cur_map_p = latestMap; + MapWrapper* cur_map = cur_map_p.get(); + + if (opt.view_type == LatencyCollectorDumpOptions::TREE) { + return cur_map->dumpTree(dump_inst, opt); + } else { + return cur_map->dump(dump_inst, opt); + } + } + +private: + static const size_t MAX_ADD_NEW_ITEM_RETRIES = 16; + // Mutex for Compare-And-Swap of latestMap. + std::mutex lock; + MapWrapperSP latestMap; +}; + +struct ThreadTrackerItem { + ThreadTrackerItem() + : numStacks(0), + aggrStackNameRaw(4096), + lenName(0) + {} + + void pushStackName(const std::string& cur_stack_name) { + size_t cur_stack_name_len = cur_stack_name.size(); + while (lenName + 4 + cur_stack_name_len > aggrStackNameRaw.size()) { + // Double the string buffer. + aggrStackNameRaw.resize(aggrStackNameRaw.size() * 2); + } + + // Remember the latest length for later poping up. 
+ lenStack.push_back(lenName); + strcpy(&aggrStackNameRaw[0] + lenName, " ## "); + lenName += 4; + strcpy(&aggrStackNameRaw[0] + lenName, cur_stack_name.c_str()); + lenName += cur_stack_name_len; + + numStacks++; + } + + size_t popLastStack() { + lenName = *(lenStack.rbegin()); + lenStack.pop_back(); + + return --numStacks; + } + + std::string getAggrStackName() { + aggrStackNameRaw[lenName] = 0; + return &aggrStackNameRaw[0]; + } + + size_t numStacks; + std::vector aggrStackNameRaw; + size_t lenName; + std::list lenStack; +}; + +struct LatencyCollectWrapper { + using SystemClock = std::chrono::system_clock; + using TimePoint = std::chrono::time_point; + using MicroSeconds = std::chrono::microseconds; + + LatencyCollectWrapper(LatencyCollector *_lat, + const std::string& _func_name) { + lat = _lat; + if (lat) { + start = SystemClock::now(); + + thread_local ThreadTrackerItem thr_item; + cur_tracker = &thr_item; + cur_tracker->pushStackName(_func_name); + } + } + + ~LatencyCollectWrapper() { + if (lat) { + TimePoint end = SystemClock::now(); + auto us = std::chrono::duration_cast(end - start); + + lat->addLatency(cur_tracker->getAggrStackName(), us.count()); + cur_tracker->popLastStack(); + } + } + + LatencyCollector *lat; + ThreadTrackerItem *cur_tracker; + TimePoint start; +}; + +#if defined(WIN32) || defined(_WIN32) +#define collectFuncLatency(lat) \ + LatencyCollectWrapper LCW__func_latency__((lat), __FUNCTION__) +#else +#define collectFuncLatency(lat) \ + LatencyCollectWrapper LCW__func_latency__((lat), __func__) +#endif + +#define collectBlockLatency(lat, name) \ + LatencyCollectWrapper LCW__block_latency__((lat), name) + diff --git a/src/latency_dump.h b/src/latency_dump.h new file mode 100644 index 0000000..0f614c3 --- /dev/null +++ b/src/latency_dump.h @@ -0,0 +1,339 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. 
+Author/Developer(s): Jung-Sang Ahn + +Original Copyright 2017 Jung-Sang Ahn +See URL: https://github.com/greensky00/latency-collector + (v0.2.1) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "latency_collector.h" + +#include +#include +#include + +class LatencyDumpDefaultImpl : public LatencyDump { +public: + std::string dump(MapWrapper* map_w, + const LatencyCollectorDumpOptions& opt) { + std::stringstream ss; + if (!map_w->getSize()) { + ss << "# stats: " << map_w->getSize() << std::endl; + return ss.str(); + } + + std::multimap > map_uint64_t; + std::map map_string; + size_t max_name_len = 9; // reserved for "STAT NAME" 9 chars + + std::unordered_map& map = getMap(map_w); + + // Deduplication + for (auto& entry: map) { + LatencyItem *item = entry.second; + if (!item->getNumCalls()) { + continue; + } + std::string actual_name = getActualFunction(item->getName(), false); + + auto existing = map_string.find(actual_name); + if (existing != map_string.end()) { + LatencyItem* item_found = existing->second; + *item_found += *item; + } else { + LatencyItem* new_item = new LatencyItem(*item); + map_string.insert( std::make_pair(actual_name, new_item) ); + } + + if (actual_name.size() > max_name_len) { + max_name_len = actual_name.size(); + } + } + + ss << "# stats: " << map_string.size() << std::endl; + + for (auto& entry: map_string) { + LatencyItem *item = entry.second; + if 
(!item->getNumCalls()) continue; + + switch (opt.sort_by) { + case LatencyCollectorDumpOptions::NAME: { + // Do nothing + break; + } + + // Otherwise: dealing with uint64_t, map_uint64_t. + case LatencyCollectorDumpOptions::TOTAL_TIME: + addToUintMap(item->getTotalTime(), map_uint64_t, item); + break; + + case LatencyCollectorDumpOptions::NUM_CALLS: + addToUintMap(item->getNumCalls(), map_uint64_t, item); + break; + + case LatencyCollectorDumpOptions::AVG_LATENCY: + addToUintMap(item->getAvgLatency(), map_uint64_t, item); + break; + } + } + + addDumpTitle(ss, max_name_len); + + if (opt.sort_by == LatencyCollectorDumpOptions::NAME) { + // Name (string) + for (auto& entry: map_string) { + LatencyItem *item = entry.second; + if (item->getNumCalls()) { + ss << dumpItem(item, max_name_len, 0, false) + << std::endl; + } + } + } else { + // Otherwise (number) + for (auto& entry: map_uint64_t) { + LatencyItem *item = entry.second; + if (item->getNumCalls()) { + ss << dumpItem(item, max_name_len, 0, false) + << std::endl; + } + } + } + + // Free all. + for (auto& entry: map_string) { + delete entry.second; + } + + return ss.str(); + } + + std::string dumpTree(MapWrapper* map_w, + const LatencyCollectorDumpOptions& opt) { + std::stringstream ss; + DumpItem root; + + // Sort by name first. + std::map by_name; + std::unordered_map& map = getMap(map_w); + for (auto& entry : map) { + LatencyItem *item = entry.second; + by_name.insert( std::make_pair(item->getName(), item) ); + } + + size_t max_name_len = 9; + std::vector last_ptr(1); + last_ptr[0] = &root; + for (auto& entry : by_name) { + LatencyItem *item = entry.second; + std::string item_name = item->getName(); + size_t level = getNumStacks(item_name); + if (!level) { + // Not a thread-aware latency item, stop. 
+ return dump(map_w, opt); + } + + DumpItem* parent = last_ptr[level-1]; + assert(parent); // Must exist + + DumpItemP dump_item(new DumpItem(level, item, parent->itself)); + if (level >= last_ptr.size()) { + last_ptr.resize(level*2); + } + last_ptr[level] = dump_item.get(); + parent->child.push_back(std::move(dump_item)); + + size_t actual_name_len = getActualFunction(item_name).size(); + if (actual_name_len > max_name_len) { + max_name_len = actual_name_len; + } + } + + addDumpTitle(ss, max_name_len); + dumpRecursive(ss, &root, max_name_len); + + return ss.str(); + } + +private: + static std::string usToString(uint64_t us) { + std::stringstream ss; + if (us < 1000) { + // us + ss << std::fixed << std::setprecision(0) << us << " us"; + } else if (us < 1000000) { + // ms + double tmp = static_cast(us / 1000.0); + ss << std::fixed << std::setprecision(1) << tmp << " ms"; + } else if (us < (uint64_t)600 * 1000000) { + // second (from 1 second to 10 mins) + double tmp = static_cast(us / 1000000.0); + ss << std::fixed << std::setprecision(1) << tmp << " s"; + } else { + // minute + double tmp = static_cast(us / 60.0 / 1000000.0); + ss << std::fixed << std::setprecision(0) << tmp << " m"; + } + return ss.str(); + } + + static std::string countToString(uint64_t count) { + std::stringstream ss; + if (count < 1000) { + ss << count; + } else if (count < 1000000) { + // K + double tmp = static_cast(count / 1000.0); + ss << std::fixed << std::setprecision(1) << tmp << "K"; + } else if (count < (uint64_t)1000000000) { + // M + double tmp = static_cast(count / 1000000.0); + ss << std::fixed << std::setprecision(1) << tmp << "M"; + } else { + // B + double tmp = static_cast(count / 1000000000.0); + ss << std::fixed << std::setprecision(1) << tmp << "B"; + } + return ss.str(); + } + + static std::string ratioToPercent(uint64_t a, uint64_t b) { + std::stringstream ss; + double tmp = (double)100.0 * a / b; + ss << std::fixed << std::setprecision(1) << tmp << " %"; + return 
ss.str(); + } + + static size_t getNumStacks(const std::string& str) { + size_t pos = 0; + size_t str_size = str.size(); + size_t ret = 0; + while (pos < str_size) { + pos = str.find(" ## ", pos); + if (pos == std::string::npos) break; + pos += 4; + ret++; + } + return ret; + } + + static std::string getActualFunction(const std::string& str, + bool add_tab = true) { + size_t level = getNumStacks(str); + if (!level) { + return str; + } + + size_t pos = str.rfind(" ## "); + std::string ret = ""; + if (level > 1 && add_tab) { + for (size_t i=1; igetStatName(), add_tab) << ": "; + ss << std::right; + ss << std::setw(8) << usToString(item->getTotalTime()) << " "; + if (parent_total_time) { + ss << std::setw(7) + << ratioToPercent(item->getTotalTime(), parent_total_time) + << " "; + } else { + ss << " ---" << " "; + } + ss << std::setw(6) << countToString(item->getNumCalls()) << " "; + ss << std::setw(8) << usToString(item->getAvgLatency()) << " "; + ss << std::setw(8) << usToString(item->getPercentile(50)) << " "; + ss << std::setw(8) << usToString(item->getPercentile(99)) << " "; + ss << std::setw(8) << usToString(item->getPercentile(99.9)); + return ss.str(); + } + + struct DumpItem { + using UPtr = std::unique_ptr; + + DumpItem() : level(0), itself(nullptr), parent(nullptr) {} + DumpItem(size_t _level, LatencyItem* _item, LatencyItem* _parent) + : level(_level), + itself(_item), + parent(_parent) {} + + size_t level; + LatencyItem* itself; + LatencyItem* parent; + std::list child; + }; + using DumpItemP = DumpItem::UPtr; + + static void dumpRecursive(std::stringstream& ss, + DumpItem* dump_item, + size_t max_name_len) { + if (dump_item->itself) { + if (dump_item->parent) { + ss << dumpItem(dump_item->itself, max_name_len, + dump_item->parent->getTotalTime()); + } else { + ss << dumpItem(dump_item->itself, max_name_len); + } + ss << std::endl; + } + for (auto& entry : dump_item->child) { + DumpItem* child = entry.get(); + dumpRecursive(ss, child, max_name_len); + } + 
} + + static void addDumpTitle(std::stringstream& ss, size_t max_name_len) { + ss << std::left << std::setw(max_name_len) << "STAT NAME" << ": "; + ss << std::right; + ss << std::setw(8) << "TOTAL" << " "; + ss << std::setw(7) << "RATIO" << " "; + ss << std::setw(6) << "CALLS" << " "; + ss << std::setw(8) << "AVERAGE" << " "; + ss << std::setw(8) << "p50" << " "; + ss << std::setw(8) << "p99" << " "; + ss << std::setw(8) << "p99.9"; + ss << std::endl; + } + + static void addToUintMap(uint64_t value, + std::multimap >& map, + LatencyItem* item) + { + map.insert( std::make_pair(value, item) ); + } +}; + + diff --git a/src/list.cc b/src/list.cc new file mode 100644 index 0000000..11bbb13 --- /dev/null +++ b/src/list.cc @@ -0,0 +1,156 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2018 Jung-Sang Ahn. +See URL: https://github.com/greensky00/linkedlist + (v0.1.2) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "list.h" + +// LCOV_EXCL_START + +namespace c_list { + +void list_push_front(struct list* list, struct list_elem* e) +{ + if (list->head == NULL) { + list->head = e; + list->tail = e; + e->next = e->prev = NULL; + } else { + list->head->prev = e; + e->prev = NULL; + e->next = list->head; + list->head = e; + } + list->num_nodes++; +} + +void list_push_back(struct list* list, struct list_elem* e) +{ + if (list->tail == NULL) { + list->head = e; + list->tail = e; + e->next = e->prev = NULL; + } else { + list->tail->next = e; + e->prev = list->tail; + e->next = NULL; + list->tail = e; + } + list->num_nodes++; +} + +void list_insert_before(struct list* list, + struct list_elem* pivot, + struct list_elem* e) +{ + e->prev = pivot->prev; + e->next = pivot; + if (pivot->prev) { + pivot->prev->next = e; + } else { + list->head = e; + } + pivot->prev = e; + + list->num_nodes++; +} + +void list_insert_after(struct list* list, + struct list_elem* pivot, + struct list_elem* e) +{ + e->next = pivot->next; + e->prev = pivot; + if (pivot->next) { + pivot->next->prev = e; + } else { + list->tail = e; + } + pivot->next = e; + + list->num_nodes++; +} + +struct list_elem* list_remove(struct list* list, + struct list_elem* e) +{ + if (e) { + if (e->next) e->next->prev = e->prev; + if (e->prev) e->prev->next = e->next; + + if (list->head == e) list->head = e->next; + if (list->tail == e) list->tail = e->prev; + + list->num_nodes--; + return e->next; + } + return NULL; +} + +struct list_elem* list_remove_reverse(struct list* list, + struct list_elem* e) +{ + if (e) { + if (e->next) e->next->prev = e->prev; + if (e->prev) e->prev->next = e->next; + + if (list->head == e) list->head = e->next; + if (list->tail == e) list->tail = e->prev; + + list->num_nodes--; + return e->prev; + } + return NULL; +} + +struct list_elem* list_pop_front(struct list* list) +{ + struct list_elem *e = list->head; + if (e) { 
+ if (e->next) e->next->prev = e->prev; + if (e->prev) e->prev->next = e->next; + + if (list->head == e) list->head = e->next; + if (list->tail == e) list->tail = e->prev; + + list->num_nodes--; + return e; + } + return NULL; +} + +struct list_elem* list_pop_back(struct list* list) +{ + struct list_elem* e = list->tail; + if (e) { + if (e->next) e->next->prev = e->prev; + if (e->prev) e->prev->next = e->next; + + if (list->head == e) list->head = e->next; + if (list->tail == e) list->tail = e->prev; + + list->num_nodes--; + return e; + } + return NULL; +} + +} + +// LCOV_EXCL_STOP + diff --git a/src/list.h b/src/list.h new file mode 100644 index 0000000..0088cba --- /dev/null +++ b/src/list.h @@ -0,0 +1,115 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2018 Jung-Sang Ahn. +See URL: https://github.com/greensky00/linkedlist + (v0.1.2) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include +#include + +namespace c_list { + +struct list_elem { + struct list_elem* prev; + struct list_elem* next; +}; + +struct list { + struct list_elem* head; + struct list_elem* tail; + uint32_t num_nodes; +}; + +#ifndef _get_entry +#define _get_entry(ELEM, STRUCT, MEMBER) \ + ((STRUCT *) ((uint8_t *) (ELEM) - offsetof (STRUCT, MEMBER))) +#endif + +static inline void list_init(struct list* list) +{ + list->head = NULL; + list->tail = NULL; + list->num_nodes = 0; +} + +static inline void list_elem_init(struct list_elem* le) +{ + le->prev = NULL; + le->next = NULL; +} + +static inline size_t list_size(struct list* list) { + return list->num_nodes; +} + +// Insert `e` at the head of `list`. +void list_push_front(struct list* list, struct list_elem* e); + +// Insert `e` at the tail of `list`. +void list_push_back(struct list* list, struct list_elem* e); + +// Insert `e` before `pivot`. +void list_insert_before(struct list* list, + struct list_elem* pivot, + struct list_elem* e); + +// Insert `e` after `pivot`. +void list_insert_after(struct list* list, + struct list_elem* pivot, + struct list_elem* e); + +// Remove `e`, and return its next. +struct list_elem* list_remove(struct list* list, struct list_elem* e); + +// Remove `e`, and return its prev. +struct list_elem* list_remove_reverse(struct list* list, struct list_elem* e); + +// Remove the head of `list`, and then return it. +struct list_elem* list_pop_front(struct list* list); + +// Remove the tail of `list`, and then return it. 
+struct list_elem* list_pop_back(struct list* list); + +static inline struct list_elem* list_begin(struct list* list) +{ + return list->head; +} + +static inline struct list_elem* list_end(struct list* list) +{ + return list->tail; +} + +static inline struct list_elem* list_next(struct list_elem* e) +{ + return e->next; +} + +static inline struct list_elem* list_prev(struct list_elem* e) +{ + return e->prev; +} + +static inline int list_is_empty(struct list* list) +{ + return list->head == NULL; +} + +} + diff --git a/src/log_file.cc b/src/log_file.cc new file mode 100644 index 0000000..cde57f4 --- /dev/null +++ b/src/log_file.cc @@ -0,0 +1,806 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
**************************************************************************/

#include "log_file.h"

#include "db_mgr.h"
#include "internal_helper.h"
#include "log_mgr.h"

#include _MACRO_TO_STR(LOGGER_H)

namespace jungle {

// Magic bytes and format version written at the beginning of every log file.
static uint8_t LOGFILE_FOOTER[8] = {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0xab, 0xcc};
static uint32_t LOGFILE_VERSION = 0x1;

LogFile::LogFile(const LogMgr* log_mgr)
    : logFileNum(0)
    , fHandle(nullptr)
    , mTable(nullptr)
    , immutable(false)
    , coldChk(false)
    , integrity(IntegrityTypes::UNKNOWN)
    , memtablePurged(false)
    , logMgr(log_mgr)
    , fileSize(0)
    , myLog(nullptr)
    {}

LogFile::~LogFile() {
    // Close the on-disk handle (if still open) before freeing it,
    // then free the in-memory table.
    if (fHandle) {
        if (fHandle->isOpened()) {
            fOps->close(fHandle);
        }
        DELETE(fHandle);
    }

    if (mTable) {
        DELETE(mTable);
    }
}

// A file counts as synced when its mem-table's synced sequence number has
// caught up with the max sequence number. `coldChk` forces "not synced".
bool LogFile::isSynced() {
    if (coldChk) return false;
    if (mTable) {
        uint64_t max = mTable->maxSeqNum;
        uint64_t synced = mTable->syncedSeqNum;
        if ( valid_number(max) &&
             ( !valid_number(synced) ||
               synced < max ) ) {
            return false;
        }
    }
    return true;
}

// The file handle may be closed only once the file is immutable
// (no further appends) and fully synced.
bool LogFile::okToCloseFHandle() {
    return immutable && isSynced();
}

// Build the on-disk name for a log file, e.g. "<path>/log0001_00000001".
// NOTE(review): p_num/l_num are 16-byte buffers but a 64-bit value can be up
// to 20 decimal digits; "%04"/"%08" are minimum widths, not maximums —
// pathological prefix/file numbers could overflow these buffers. Confirm the
// callers bound these values, or switch to snprintf.
std::string LogFile::getLogFileName(const std::string& path,
                                    uint64_t prefix_num,
                                    uint64_t log_file_num)
{
    // Log file name example: log0001_00000001
    //                        log0001_00000002
    //                        ...
    char p_num[16];
    char l_num[16];
    sprintf(p_num, "%04" PRIu64, prefix_num);
    sprintf(l_num, "%08" PRIu64, log_file_num);
    std::string l_filename = path + "/log" + p_num + "_" + l_num;
    return l_filename;
}

// Open (or re-open) the OS file handle for `filename`.
Status LogFile::openFHandle() {
    Status s;
    EP( fOps->open(&fHandle, filename.c_str()) );
    _log_debug(myLog, "open file %s handle %p",
               filename.c_str(), fHandle);
    return s;
}

// Close and free the OS file handle. Safe to call when already closed.
Status LogFile::closeFHandle() {
    if (!fHandle) return Status::ALREADY_CLOSED;

    if (fHandle->isOpened()) {
        // Once file is closed, file integrity is unknown.
        integrity = IntegrityTypes::UNKNOWN;
        fOps->close(fHandle);
    }
    _log_debug(myLog, "close file %s handle %p",
               filename.c_str(), fHandle);
    delete fHandle;
    fHandle = nullptr;

    return Status();
}

// Refresh the last-access timestamp (used by the TTL-based expiry below).
void LogFile::touch() {
    lastAcc.reset();
}

// True once the file has been idle longer than the configured TTL
// (logFileTtl_sec, compared in microseconds).
bool LogFile::isExpired() {
    if ( lastAcc.getUs() >
             (uint64_t)logMgr->getDbConfig()->logFileTtl_sec * 1000000 ) {
        return true;
    }
    return false;
}

// Microseconds since the last access.
uint64_t LogFile::getLastAcc() {
    return lastAcc.getUs();
}

// Create a brand-new log file: removes any stale file with the same name,
// initializes a fresh mem-table starting at `start_seq_num`, and writes the
// file header (footer magic + version) to disk.
Status LogFile::create(const std::string& _filename,
                       FileOps* _f_ops,
                       const uint64_t log_file_num,
                       const uint64_t start_seq_num)
{
    if (mTable) return Status::ALREADY_INITIALIZED;

    filename = _filename;
    fOps = _f_ops;
    logFileNum = log_file_num;

    if (fOps->exist(filename)) {
        // Previous file exists, which means that there is a legacy log file.
        // We should overwrite it.
        _log_warn(myLog, "file %s already exists, remove it", filename.c_str());
        fOps->remove(filename);
    }

    // New created file is valid.
    integrity = IntegrityTypes::VALID;
    Status s;

    // Create a memtable.
    mTable = new MemTable(this);
    mTable->setLogger(myLog);
    EP( mTable->init(start_seq_num) );

    // Prepend footer and version.
    if (!fHandle) {
        EP( openFHandle() );
    }

    // NOTE(review): `fbuf` is sized to the current file size (0 for a fresh
    // file) yet the serializer writes 12 header bytes into it — presumably
    // RwSerializer grows/handles the buffer; confirm.
    size_t fsize = fOps->eof(fHandle);
    SizedBuf fbuf(fsize);
    SizedBuf::Holder h_fbuf(fbuf);
    EP( fOps->pread(fHandle, fbuf.data, fbuf.size, 0) );

    RwSerializer ss(&fbuf);
    ss.put(LOGFILE_FOOTER, 8);
    ss.putU32(LOGFILE_VERSION);
    EP( fOps->append(fHandle, fbuf.data, ss.pos()) );
    EP( fOps->flush(fHandle) );

    if (okToCloseFHandle()) {
        EP( closeFHandle() );
    }
    touch();

    return Status();
}

// Attach to an existing on-disk log file (recovery path). The file becomes
// immutable; sequence-number metadata is taken from the manifest. In TTL
// (reclaim) mode the mem-table load is deferred until first access.
Status LogFile::load(const std::string& _filename,
                     FileOps* _f_ops,
                     uint64_t log_file_num,
                     uint64_t min_seq,
                     uint64_t flushed_seq,
                     uint64_t synced_seq)
{
    touch();

    if (mTable) return Status::ALREADY_INITIALIZED;

    filename = _filename;
    fOps = _f_ops;
    logFileNum = log_file_num;
    // We will not append existing file.
    immutable = true;

    if (!fOps->exist(filename.c_str())) return Status::FILE_NOT_EXIST;

    memTableOnFileMeta.minSeq = min_seq;
    memTableOnFileMeta.flushedSeq = flushed_seq;
    memTableOnFileMeta.syncedSeq = synced_seq;
    // Until the mem-table is actually loaded, the synced seq is the max.
    memTableOnFileMeta.maxSeq = synced_seq;
    if (logMgr->isTtlMode()) {
        // If reclaim mode, load mem-table lazily.
        memtablePurged = true;
        return Status();
    }
    return loadMemTable();
}

// (Head of LogFile::loadMemTable — the remainder of this function is outside
// this view.)
Status LogFile::loadMemTable() {
    touch();

    if (mTable) return Status::ALREADY_INITIALIZED;

    Status s;
    // Open log file.
    if (!fHandle) {
        EP( openFHandle() );
    }

    // Check footer and version.
    fileSize = fOps->eof(fHandle);
    if (!fileSize) {
        // If file is empty, treat it as a non-exist file.
+        closeFHandle();
+        return Status::FILE_NOT_EXIST;
+    }
+
+    SizedBuf read_buf(fileSize);
+    SizedBuf::Holder h_read_buf(read_buf);
+
+    // Propagate read errors instead of silently parsing an uninitialized
+    // buffer (consistent with create() and truncate(), which check this call).
+    EP( fOps->pread(fHandle, read_buf.data, read_buf.size, 0) );
+
+    uint8_t footer_file[8];
+    RwSerializer ss(read_buf);
+    ss.get(footer_file, 8);
+    uint32_t ver_file = ss.getU32(s);
+    (void)ver_file;
+
+    // Load to MemTable.
+    mTable = new MemTable(this);
+    mTable->setLogger(myLog);
+    EP( mTable->load( ss,
+                      memTableOnFileMeta.minSeq,
+                      memTableOnFileMeta.flushedSeq,
+                      memTableOnFileMeta.syncedSeq ) );
+    if (okToCloseFHandle()) {
+        EP( closeFHandle() );
+    }
+    memtablePurged = false;
+    // `logFileNum` is uint64_t: use PRIu64 (not %zu, which expects size_t).
+    _log_info(myLog, "loaded memtable of file %" PRIu64, logFileNum);
+
+    return Status();
+}
+
+// Truncates the file so that records with seq number > `seq_upto`
+// are discarded, and rolls the in-memory watermarks back to `seq_upto`.
+Status LogFile::truncate(uint64_t seq_upto) {
+    Status s;
+    // Open log file
+    if (!fHandle) {
+        EP( openFHandle() );
+    }
+
+    // Check footer and version.
+    fileSize = fOps->eof(fHandle);
+    if (!fileSize) {
+        // If file is empty, treat it as a non-exist file.
+        closeFHandle();
+        return Status::FILE_NOT_EXIST;
+    }
+
+    try {
+        SizedBuf read_buf(fileSize);
+        SizedBuf::Holder h_read_buf(read_buf);
+
+        TC( fOps->pread(fHandle, read_buf.data, read_buf.size, 0) );
+
+        RwSerializer ss(read_buf);
+        if (!ss.available(8 + 4)) {
+            throw Status(Status::INCOMPLETE_LOG);
+        }
+
+        uint8_t footer_file[8];
+        ss.get(footer_file, 8);
+        uint32_t ver_file = ss.getU32(s);
+        (void)ver_file;
+
+        uint64_t offset_to_truncate = 0;
+        MemTable::findOffsetOfSeq(myLog, ss, seq_upto, offset_to_truncate);
+
+        if (mTable) {
+            mTable->maxSeqNum = seq_upto;
+            mTable->syncedSeqNum = seq_upto;
+            mTable->seqNumAlloc = seq_upto;
+        }
+        memTableOnFileMeta.maxSeq = seq_upto;
+        memTableOnFileMeta.syncedSeq = seq_upto;
+
+        if (!offset_to_truncate) {
+            if (okToCloseFHandle()) {
+                EP( closeFHandle() );
+            }
+            return Status::SEQNUM_NOT_FOUND;
+        }
+
+        // Truncate.
+ fOps->ftruncate(fHandle, offset_to_truncate); + + if (okToCloseFHandle()) { + EP( closeFHandle() ); + } + + return Status(); + } catch (Status s) { + if (okToCloseFHandle()) { + closeFHandle(); + } + return s; + } +} + +Status LogFile::assignSeqNum(Record& rec_local) { + touch(); + return mTable->assignSeqNum(rec_local); +} + +Status LogFile::setSN(const Record& rec) { + touch(); + + // Put into memtable + Status s; + bool overwrite = false; + if ( valid_number(rec.seqNum) && + logMgr->getDbConfig()->allowOverwriteSeqNum && + rec.seqNum <= mTable->maxSeqNum ) { + overwrite = true; + } + + s = mTable->putNewRecord(rec); + if (overwrite) { + if ( valid_number(mTable->flushedSeqNum) && + rec.seqNum <= mTable->flushedSeqNum ) { + mTable->flushedSeqNum = rec.seqNum - 1; + } + + if ( valid_number(mTable->syncedSeqNum) && + rec.seqNum <= mTable->syncedSeqNum ) { + mTable->syncedSeqNum = rec.seqNum - 1; + } + } + return s; +} + +Status LogFile::getSN(const uint64_t seq_num, Record& rec_out) { + touch(); + return mTable->findRecordBySeq(seq_num, rec_out); +} + +Status LogFile::get(const uint64_t chk, + const SizedBuf& key, + uint64_t* key_hash, + Record& rec_out, + bool allow_tombstone) +{ + touch(); + Status s; + EP( mTable->getRecordByKey(chk, key, key_hash, rec_out, allow_tombstone) ); + return Status(); +} + +Status LogFile::flushMemTable() { + touch(); + // Skip unnecessary flushing + if (immutable && !fHandle && isSynced()) { + _log_debug(myLog, + "skip unnecessary flushing to file %s %ld", + filename.c_str(), logFileNum); + return Status(); + } + + try { + Status s; + if (!fHandle) { + _log_warn(myLog, + "try to flush into a file that already closed %s %ld", + filename.c_str(), logFileNum); + integrity = IntegrityTypes::UNKNOWN; + TC( openFHandle() ); + } + assert(fHandle); + +#if 0 + // Memory buffer based approach. 
+ SizedBuf a_buf(4096); + SizedBuf::Holder h_a_buf(a_buf); + + RwSerializer rws(&a_buf); + + TC( mTable->flush(rws) ); + TC( mTable->appendFlushMarker(rws) ); + + // Append at the end. + size_t last_pos = fOps->eof(fHandle); + TC( fOps->pwrite(fHandle, a_buf.data, rws.pos(), last_pos) ); + + fileSize = fOps->eof(fHandle); + if (immutable) { + TC( closeFHandle() ); + } + return Status(); + +#else + // Writing into pre-exist file needs to check integrity + if (IntegrityTypes::UNKNOWN == integrity) { + std::string message; + fileSize = fOps->eof(fHandle); + _log_warn(myLog, + "flush into pre-exist file %s %ld, size %ld, " + "checking file integrity... ", + filename.c_str(), logFileNum, fileSize); + if (!fileSize) { + // If file is empty, treat it as new created file. + integrity = IntegrityTypes::VALID; + message = "it's empty"; + } else { + SizedBuf read_buf(fileSize); + SizedBuf::Holder h_read_buf(read_buf); + + TC( fOps->pread(fHandle, read_buf.data, read_buf.size, 0) ); + + RwSerializer ss(read_buf); + if (!ss.available(8 + 4)) { + integrity = IntegrityTypes::CORRUPTED; + message = "file footer is incomplete"; + } else { + uint8_t footer_file[8]; + ss.get(footer_file, 8); + uint32_t ver_file = ss.getU32(s); + (void) ver_file; + + uint64_t offset_to_truncate = 0; + uint64_t padding_start_pos = NOT_INITIALIZED; + s = MemTable::findOffsetOfSeq(myLog, + ss, + NOT_INITIALIZED, + offset_to_truncate, + &padding_start_pos); + integrity = s ? 
IntegrityTypes::VALID : IntegrityTypes::CORRUPTED; + if (s && NOT_INITIALIZED != padding_start_pos) { + // Get rid of padding bytes for new writes + Status ts = fOps->ftruncate(fHandle, padding_start_pos); + if (!ts) { + // Treat it corrupted if failed to remove padding bytes + integrity = IntegrityTypes::CORRUPTED; + message = "failed to remove padding bytes"; + } + } + } + } + if (IntegrityTypes::CORRUPTED == integrity) { + _log_err(myLog, + "integrity check of file %s %ld, size %ld, " + "result CORRUPTED, msg %s", + filename.c_str(), logFileNum, fileSize, + message.c_str()); + } + } + + if (IntegrityTypes::CORRUPTED == integrity) { + _log_err(myLog, + "failed to flush into file %s %ld as file is CORRUPTED", + filename.c_str(), logFileNum); + // Don't break other files' flushing, so return OK + return Status(); + } + + RwSerializer rws(fOps, fHandle, true); + + TC( mTable->flush(rws) ); + TC( mTable->appendFlushMarker(rws) ); + + TC( fOps->flush(fHandle) ); + + fileSize = fOps->eof(fHandle); + if (okToCloseFHandle()) { + TC( closeFHandle() ); + } + return Status(); +#endif + + } catch (Status s) { + if (okToCloseFHandle()) { + closeFHandle(); + } + return s; + } +} + +Status LogFile::purgeMemTable() { + Timer timer; + + // WARNING: + // The caller of this function should coordinate + // race condition between load(), get(), and so on. + + // Can purge memtable only when the file is immutable. + if (!immutable) return Status::FILE_IS_NOT_IMMUTABLE; + + memTableOnFileMeta.minSeq = mTable->minSeqNum; + memTableOnFileMeta.maxSeq = mTable->maxSeqNum; + memTableOnFileMeta.flushedSeq = mTable->flushedSeqNum; + memTableOnFileMeta.syncedSeq = mTable->syncedSeqNum; + + // Now all incoming request should not go to `mTable`. 
+ memtablePurged = true; + + if (mTable) { + DELETE(mTable); + } + _log_info( myLog, + "purged memtable of file %s %zu, %zu us, min seq %s, " + "max seq %s, flush seq %s, sync seq %s", + filename.c_str(), + logFileNum, + timer.getUs(), + _seq_str( memTableOnFileMeta.minSeq ).c_str(), + _seq_str( memTableOnFileMeta.maxSeq ).c_str(), + _seq_str( memTableOnFileMeta.flushedSeq ).c_str(), + _seq_str( memTableOnFileMeta.syncedSeq ).c_str() ); + + return Status(); +} + +uint64_t LogFile::size() const { + return mTable ? mTable->size() : 0; +} + +Status LogFile::sync() { + touch(); + + Status s; + if (!fHandle) { + EP( openFHandle() ); + } + s = mTable->sync(fOps, fHandle); + if (okToCloseFHandle()) { + EP( closeFHandle() ); + } + return s; +} + +Status LogFile::checkpoint(uint64_t& seq_num_out) { + touch(); + + // A new checkpoint is added into an immutable file + // Needs to flush memtable again + if (immutable) coldChk = true; + + Status s; + s = mTable->checkpoint(seq_num_out); + return s; +} + +Status LogFile::getLogsToFlush(const uint64_t seq_num, + std::list& list_out, + bool ignore_sync_seqnum) +{ + touch(); + + Status s; + s = mTable->getLogsToFlush(seq_num, list_out, ignore_sync_seqnum); + return s; +} + +Status LogFile::getCheckpoints(const uint64_t seq_num, + std::list& list_out) +{ + touch(); + + Status s; + s = mTable->getCheckpoints(seq_num, list_out); + return s; +} + +Status LogFile::setSyncedSeqNum(const uint64_t seq_num) { + touch(); + if (!mTable) { + memTableOnFileMeta.syncedSeq = seq_num; + return Status(); + } + + if (!valid_number(mTable->maxSeqNum)) { + // Nothing was appended. 
+        return Status();
+    }
+
+    if (seq_num > mTable->maxSeqNum) {
+        mTable->syncedSeqNum.store(mTable->maxSeqNum.load());
+    } else {
+        mTable->syncedSeqNum = seq_num;
+    }
+    // `logFileNum` and seq numbers are uint64_t: %ld has the wrong
+    // signedness (and wrong size on LLP64/32-bit targets); use PRIu64.
+    _log_debug(myLog, "Set log file %" PRIu64 " synced seqnum to %" PRIu64 ".",
+               logFileNum, mTable->syncedSeqNum.load());
+    return Status();
+}
+
+// Sets the flushed watermark, clamped to the current synced watermark.
+Status LogFile::setFlushedSeqNum(const uint64_t seq_num) {
+    touch();
+    if (!mTable) {
+        memTableOnFileMeta.flushedSeq = seq_num;
+        return Status();
+    }
+
+    if (!valid_number(mTable->syncedSeqNum)) {
+        // Nothing was flushed.
+        return Status();
+    }
+
+    if (seq_num > mTable->syncedSeqNum) {
+        mTable->flushedSeqNum.store(mTable->syncedSeqNum.load());
+    } else {
+        mTable->flushedSeqNum = seq_num;
+    }
+    _log_debug(myLog, "Set log file %" PRIu64 " flushed seqnum to %" PRIu64 ".",
+               logFileNum, mTable->flushedSeqNum.load());
+    return Status();
+}
+
+// Force-sets all seq number watermarks at once (bulk-load path).
+// NOTE(review): dereferences `mTable` unconditionally — assumes the bulk
+// loader only runs while the memtable exists; confirm against callers.
+Status LogFile::updateSeqNumByBulkLoader(const uint64_t seq_num) {
+    touch();
+
+    mTable->flushedSeqNum = seq_num;
+    mTable->syncedSeqNum = seq_num;
+    mTable->minSeqNum = seq_num;
+    mTable->maxSeqNum = seq_num;
+    return Status();
+}
+
+Status LogFile::destroySelf() {
+    if (fHandle) {
+        if (fHandle->isOpened()) {
+            fOps->close(fHandle);
+        }
+        delete fHandle;
+        fHandle = nullptr;
+    }
+
+    if (fOps->exist(filename.c_str())) {
+        // Instead of removing it immediately,
+        // put it into remove list.
+ DBMgr* dbm = DBMgr::getWithoutInit(); + if (!dbm) { + fOps->remove(filename.c_str()); + } else { + dbm->addFileToRemove(filename); + } + } + + if (mTable) { + delete mTable; + mTable = nullptr; + } + + return Status(); +} + +uint64_t LogFile::getMinSeqNum() const { + if (mTable) { + return mTable->minSeqNum; + } else { + return memTableOnFileMeta.minSeq; + } +} + +uint64_t LogFile::getFlushedSeqNum() const { + if (mTable) { + return mTable->flushedSeqNum; + } else { + return memTableOnFileMeta.flushedSeq; + } +} + +uint64_t LogFile::getSyncedSeqNum() const { + if (mTable) { + return mTable->syncedSeqNum; + } else { + return memTableOnFileMeta.syncedSeq; + } +} + +uint64_t LogFile::getMaxSeqNum() const { + if (mTable) { + return mTable->maxSeqNum.load(MOR); + } else { + return memTableOnFileMeta.maxSeq; + } +} + +void LogFile::setImmutable() { + immutable = true; + if (mTable) { + _log_info( myLog, + "log file %s %zu becomes immutable, min seq %s, " + "max seq %s, flushed seq %s, synced seq %s, size %zu", + filename.c_str(), + logFileNum, + _seq_str(mTable->minSeqNum).c_str(), + _seq_str(mTable->maxSeqNum).c_str(), + _seq_str(mTable->flushedSeqNum).c_str(), + _seq_str(mTable->syncedSeqNum).c_str(), + mTable->bytesSize.load() ); + } else { + _log_info(myLog, "log file %s %zu becomes immutable", + filename.c_str(), logFileNum); + } +} + +bool LogFile::isValidToWrite() { + // If 1) already set immutable OR + // 2) file size exceeds OR + // 3) # entries exceeds + // then not writable. 
+ uint32_t max_log_file_size = logMgr->getDbConfig()->maxLogFileSize; + uint32_t max_log_entries = logMgr->getDbConfig()->maxEntriesInLogFile; + if ( immutable || + fileSize > max_log_file_size || + mTable->getNumLogs() >= max_log_entries ) + return false; + return true; +} + +bool LogFile::isIncreasingOrder() const { + if (mTable) return mTable->isIncreasingOrder(); + return false; +} + + +LogFile::Iterator::Iterator() : lFile(nullptr) {} +LogFile::Iterator::~Iterator() {} + +Status LogFile::Iterator::init(LogFile* l_file, + const SizedBuf& start_key, + const SizedBuf& end_key, + const uint64_t seq_upto) +{ + lFile = l_file; + lFile->touch(); + return mItr.init(lFile->mTable, start_key, end_key, seq_upto); +} + +Status LogFile::Iterator::initSN(LogFile* l_file, + const uint64_t min_seq, + const uint64_t max_seq) +{ + lFile = l_file; + lFile->touch(); + return mItr.initSN(lFile->mTable, min_seq, max_seq); +} + + +Status LogFile::Iterator::get(Record& rec_out) { + lFile->touch(); + return mItr.get(rec_out); +} + +Status LogFile::Iterator::prev(bool allow_tombstone) { + lFile->touch(); + return mItr.prev(allow_tombstone); +} + +Status LogFile::Iterator::next(bool allow_tombstone) { + lFile->touch(); + return mItr.next(allow_tombstone); +} + +Status LogFile::Iterator::seek(const SizedBuf& key, SeekOption opt) { + lFile->touch(); + return mItr.seek(key, (MemTable::Iterator::SeekOption)opt); +} + +Status LogFile::Iterator::seekSN(const uint64_t seqnum, SeekOption opt) { + lFile->touch(); + return mItr.seekSN(seqnum, (MemTable::Iterator::SeekOption)opt); +} + +Status LogFile::Iterator::gotoBegin() { + lFile->touch(); + return mItr.gotoBegin(); +} + +Status LogFile::Iterator::gotoEnd() { + lFile->touch(); + return mItr.gotoEnd(); +} + +Status LogFile::Iterator::close() { + lFile->touch(); + Status s; + EP( mItr.close() ); + lFile = nullptr; + return Status(); +} + +} // namespace jungle + + diff --git a/src/log_file.h b/src/log_file.h new file mode 100644 index 
0000000..05fbfb8 --- /dev/null +++ b/src/log_file.h @@ -0,0 +1,236 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "fileops_base.h" +#include "internal_helper.h" +#include "memtable.h" + +#include + +#include +#include +#include + +class SimpleLogger; + +namespace jungle { + +// TODO: make this class virtual in the future. +class LogMgr; +class LogFile { + friend class MemTable; +public: + LogFile(const LogMgr* log_mgr); + ~LogFile(); + + static std::string getLogFileName(const std::string& path, + uint64_t prefix_num, + uint64_t log_file_num); + + Status openFHandle(); + Status closeFHandle(); + + void touch(); + + bool isExpired(); + + uint64_t getLastAcc(); + + Status create(const std::string& _filename, + FileOps* _f_ops, + const uint64_t log_file_num, + const uint64_t start_seq_num); + Status load(const std::string& _filename, + FileOps* _f_ops, + uint64_t log_file_num, + uint64_t min_seq, + uint64_t flushed_seq, + uint64_t synced_seq); + + Status loadMemTable(); + + Status truncate(uint64_t seq_upto); + + Status assignSeqNum(Record& rec_local); + + Status setSN(const Record& rec); + + // Returns pointer only. + Status getSN(const uint64_t seq_num, Record& rec_out); + + // Returns pointer only. 
+ Status get(const uint64_t chk, + const SizedBuf& key, + uint64_t* key_hash, + Record& rec_out, + bool allow_tombstone = false); + + Status flushMemTable(); + + Status purgeMemTable(); + + bool isMemTablePurged() const { return memtablePurged; } + + uint64_t size() const; + + Status sync(); + + bool isSynced(); + + Status checkpoint(uint64_t& seq_num_out); + + Status getLogsToFlush(const uint64_t seq_num, + std::list& list_out, + bool ignore_sync_seqnum); + // If seq_num == NOT_INITIALIZED, return all. + Status getCheckpoints(const uint64_t seq_num, + std::list& list_out); + Status setSyncedSeqNum(const uint64_t seq_num); + Status setFlushedSeqNum(const uint64_t seq_num); + Status updateSeqNumByBulkLoader(const uint64_t seq_num); + Status destroySelf(); + + uint64_t getMinSeqNum() const; + uint64_t getFlushedSeqNum() const; + uint64_t getSyncedSeqNum() const; + uint64_t getMaxSeqNum() const; + uint64_t getLogFileNum() const { return logFileNum; } + + void setImmutable(); + bool isImmutable() const { return immutable; } + bool isValidToWrite(); + bool isIncreasingOrder() const; + + uint64_t getSeqCounter() const { + if (mTable) return mTable->getSeqCounter(); + return NOT_INITIALIZED; + } + + Status forceSeqnum(uint64_t to) { + if (mTable) { + return mTable->forceSeqnum(to); + } + // In TTL mode, `mTable` will not exist but that's ok + // as TTL mode is only available in log-section-only mode. 
+ return Status::NOT_INITIALIZED; + } + + void setLogger(SimpleLogger* logger) { + myLog = logger; + if (mTable) mTable->setLogger(logger); + } + + class Iterator { + public: + Iterator(); + ~Iterator(); + + enum SeekOption { + GREATER = 0, + SMALLER = 1, + }; + + Status init(LogFile* l_file, + const SizedBuf& start_key, + const SizedBuf& end_key, + const uint64_t seq_upto); + Status initSN(LogFile* l_file, + const uint64_t min_seq, + const uint64_t max_seq); + Status get(Record& rec_out); + Status prev(bool allow_tombstone = false); + Status next(bool allow_tombstone = false); + Status seek(const SizedBuf& key, SeekOption opt = GREATER); + Status seekSN(const uint64_t seqnum, SeekOption opt = GREATER); + Status gotoBegin(); + Status gotoEnd(); + Status close(); + + private: + LogFile* lFile; + MemTable::Iterator mItr; + }; + +private: +// === TYPES + struct PurgedMemTableMeta { + PurgedMemTableMeta() + : minSeq(0), maxSeq(0), flushedSeq(0), syncedSeq(0) {} + uint64_t minSeq; + uint64_t maxSeq; + uint64_t flushedSeq; + uint64_t syncedSeq; + }; + + enum IntegrityTypes { + UNKNOWN, + VALID, + CORRUPTED + }; + +// === FUNCTIONS + + bool okToCloseFHandle(); + +// === VARIABLES + + // Log file number. + uint64_t logFileNum; + + // File name. + std::string filename; + + // File operations. + FileOps* fOps; + + // Handle for this log file. + FileHandle* fHandle; + + // In-memory index for this log file. + MemTable* mTable; + + // If true, immutable file. + std::atomic immutable; + + // If true, a new checkpoint is added after immutable + bool coldChk; + + // If file is corrupted, new writes are forbidden. + IntegrityTypes integrity; + + // Memtable has been purged. + std::atomic memtablePurged; + + // Memtable metadata if it is purged. + PurgedMemTableMeta memTableOnFileMeta; + + // Parent log manager. + const LogMgr* logMgr; + + // Current file size. + uint64_t fileSize; + + // Timer for last access. + Timer lastAcc; + + // Logger. 
+ SimpleLogger* myLog; +}; + +} // namespace jungle + diff --git a/src/log_iterator.cc b/src/log_iterator.cc new file mode 100644 index 0000000..24c30d5 --- /dev/null +++ b/src/log_iterator.cc @@ -0,0 +1,601 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "log_mgr.h" + +#include "db_internal.h" + +namespace jungle { + +LogMgr::Iterator::Iterator() + : lMgr(nullptr) + , snapLogList(nullptr) + , minSeqSnap(NOT_INITIALIZED) + , maxSeqSnap(NOT_INITIALIZED) + , windowCursor(nullptr) +{ + avl_init(&curWindow, nullptr); +} + +LogMgr::Iterator::~Iterator() { + close(); +} + +void LogMgr::Iterator::addLogFileItr(LogFileInfo* l_info) { + LogFile::Iterator* l_itr = new LogFile::Iterator(); + if (type == BY_SEQ) { + l_itr->initSN(l_info->file, minSeqSnap, maxSeqSnap); + } else if (type == BY_KEY) { + l_itr->init(l_info->file, startKey, endKey, maxSeqSnap); + } + + ItrItem* ctx = new ItrItem(); + ctx->lInfo = l_info; + ctx->lItr = l_itr; + Status s = ctx->lItr->get(ctx->lastRec); + if (s) { + avl_cmp_func* cmp_func = (type == BY_SEQ) + ? 
(ItrItem::cmpSeq) + : (ItrItem::cmpKey); + avl_node* avl_ret = avl_insert(&curWindow, &ctx->an, cmp_func); + assert(avl_ret == &ctx->an); (void)avl_ret; + } + itrs.push_back(ctx); +} + + +Status LogMgr::Iterator::init(DB* snap_handle, + LogMgr* log_mgr, + const SizedBuf& start_key, + const SizedBuf& end_key) +{ + uint64_t empty_seq = NOT_INITIALIZED; + return initInternal(snap_handle, log_mgr, + empty_seq, empty_seq, + start_key, end_key, BY_KEY); +} + +Status LogMgr::Iterator::initSN(DB* snap_handle, + LogMgr* log_mgr, + uint64_t min_seq, + uint64_t max_seq) +{ + SizedBuf empty_key; + return initInternal(snap_handle, log_mgr, + min_seq, max_seq, + empty_key, empty_key, BY_SEQ); +} + +Status LogMgr::Iterator::initInternal(DB* snap_handle, + LogMgr* log_mgr, + uint64_t min_seq, + uint64_t max_seq, + const SizedBuf& start_key, + const SizedBuf& end_key, + LogMgr::Iterator::Type _type) +{ + // Save current seq number status. + Status s; + s = log_mgr->getAvailSeqRange(minSeqSnap, maxSeqSnap); + // No log yet. + if (!s) return s; + + if (valid_number(min_seq) && minSeqSnap < min_seq) minSeqSnap = min_seq; + if (valid_number(max_seq) && max_seq < maxSeqSnap) maxSeqSnap = max_seq; + if (snap_handle) { + assert(snap_handle->sn); + if (maxSeqSnap > snap_handle->sn->chkNum) { + maxSeqSnap = snap_handle->sn->chkNum; + } + if (minSeqSnap > snap_handle->sn->lastFlush) { + minSeqSnap = snap_handle->sn->lastFlush; + } + } + + // No available records in log section. + if (minSeqSnap > maxSeqSnap) return Status::OUT_OF_RANGE; + + lMgr = log_mgr; + type = _type; + startKey.alloc(start_key); + endKey.alloc(end_key); + if (lMgr->getDbConfig()->cmpFunc) { + // Custom cmp function exists. 
+ avl_set_aux(&curWindow, (void*)lMgr); + } + + try { + uint64_t l_num_min; + uint64_t l_num_max; + + if (snap_handle) { + // Snapshot + assert(snap_handle->sn->logList); + for (auto& entry: *snap_handle->sn->logList) { + LogFileInfo* l_info = entry; + addLogFileItr(l_info); + } + snapLogList = snap_handle->sn->logList; + } + + // Only when not a snapshot. + for (; !snap_handle; ) { + // Normal + TC( lMgr->mani->getLastFlushedLog(l_num_min) ); + TC( lMgr->mani->getMaxLogFileNum(l_num_max) ); + + // If seq-range is given, load that files only. + if (valid_number(minSeqSnap)) { + uint64_t min_seq_log; + s = lMgr->mani->getLogFileNumBySeq(minSeqSnap, min_seq_log); + if (s) { + l_num_min = min_seq_log; + } + } + if (valid_number(maxSeqSnap)) { + uint64_t max_seq_log; + s = lMgr->mani->getLogFileNumBySeq(maxSeqSnap, max_seq_log); + if (s) { + l_num_max= max_seq_log; + } + } + + bool retry_init = false; + for (uint64_t ii = l_num_min; ii <= l_num_max; ++ii) { + LogFileInfo* l_info = nullptr; + s = lMgr->mani->getLogFileInfo(ii, l_info); + if (!s || l_info->isRemoved()) { + _log_info(log_mgr->myLog, "log file %lu: %d %s, retry", + ii, (int)s, + (l_info && l_info->isRemoved())?"removed":"not-exist"); + // `l_num_min` is invalid. Retry. + // Free all resources. + for (auto& entry: itrs) { + ItrItem* ctx = entry; + ctx->lItr->close(); + delete ctx->lItr; + ctx->lInfo->done(); + delete ctx; + } + if (s) l_info->done(); + retry_init = true; + + // Reset item list. + itrs.clear(); + + // Reset AVL-tree as well. + avl_init(&curWindow, curWindow.aux); + break; + } + addLogFileItr(l_info); + } + + if (retry_init) continue; + break; + } + + windowCursor = avl_first(&curWindow); + if (!windowCursor) throw Status(Status::OUT_OF_RANGE); + +#if 0 + // Now this feature is done by higher level iterator (i.e., Jungle iterator). 
+ if (type == BY_KEY) { + ItrItem* cur_item = _get_entry(windowCursor, ItrItem, an); + while (cur_item && !cur_item->lastRec.isIns()) { + s = next(); + if (!s) return s; + cur_item = _get_entry(windowCursor, ItrItem, an); + } + } +#endif + + return Status(); + + } catch (Status s) { + startKey.free(); + endKey.free(); + return s; + } +} + + +Status LogMgr::Iterator::get(Record& rec_out) { + if (!windowCursor) return Status::KEY_NOT_FOUND; + + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + rec_out = item->lastRec; + return Status(); +} + +Status LogMgr::Iterator::prev() { + Status s; + + ItrItem* cur_item = _get_entry(windowCursor, ItrItem, an); + uint64_t cur_seq = cur_item->lastRec.seqNum; + SizedBuf cur_key; + cur_key.alloc(cur_item->lastRec.kv.key); + + // Do prev() for all iterators GTEQ windowCursor. + // Note: opposite direction. + avl_node* cursor = avl_last(&curWindow); + while (cursor) { + ItrItem* item = _get_entry(cursor, ItrItem, an); + if (item->flags & ItrItem::no_more_prev) { + s = Status::ERROR; + } else { + if ( type == BY_SEQ && + item->lastRec.seqNum < cur_seq ) break; + if ( type == BY_KEY && + cmpSizedBuf(item->lastRec.kv.key, cur_key) < 0 ) break; + // Include tombstone. + s = item->lItr->prev(true); + } + + if (s) { + avl_remove(&curWindow, &item->an); + item->flags = ItrItem::none; + s = item->lItr->get(item->lastRec); + assert(s); + + avl_cmp_func* cmp_func = (type == BY_SEQ) + ? (ItrItem::cmpSeq) + : (ItrItem::cmpKey); + avl_node* avl_ret = avl_insert(&curWindow, &item->an, cmp_func); + assert(avl_ret == &item->an); (void)avl_ret; + cursor = avl_last(&curWindow); + } else { + item->flags |= ItrItem::no_more_prev; + cursor = avl_prev(&item->an); + } + } + + // Opposite direction. + windowCursor = avl_last(&curWindow); + ItrItem* last_valid_item = nullptr; + while (windowCursor) { + // Find *LAST* valid item (only for BY_KEY). 
+ ItrItem* item = _get_entry(windowCursor, ItrItem, an); + + bool valid = false; + if (type == BY_SEQ) { + valid = checkValidBySeq(item, cur_seq, true); + if (!valid) windowCursor = avl_prev(windowCursor); + else break; + + } else if (type == BY_KEY) { + valid = checkValidByKey(item, cur_key, true); + if (last_valid_item && + cmpSizedBuf(item->lastRec.kv.key, + last_valid_item->lastRec.kv.key) < 0) break; + if (valid) last_valid_item = item; + windowCursor = avl_prev(windowCursor); + } + } + + if (last_valid_item) windowCursor = &last_valid_item->an; + + cur_key.free(); + + if (!windowCursor) { + // Reached the end. + windowCursor = avl_first(&curWindow); + return Status::OUT_OF_RANGE; + } + return Status(); +} + +Status LogMgr::Iterator::next() { + Status s; + + ItrItem* cur_item = _get_entry(windowCursor, ItrItem, an); + uint64_t cur_seq = cur_item->lastRec.seqNum; + SizedBuf cur_key; + cur_key.alloc(cur_item->lastRec.kv.key); + + // Do next() for all iterators SMEQ windowCursor. + avl_node* cursor = avl_first(&curWindow); + while (cursor) { + ItrItem* item = _get_entry(cursor, ItrItem, an); + if (item->flags & ItrItem::no_more_next) { + s = Status::ERROR; + } else { + if ( type == BY_SEQ && + item->lastRec.seqNum > cur_seq ) break; + if ( type == BY_KEY && + cmpSizedBuf(item->lastRec.kv.key, cur_key) > 0 ) break; + // Include tombstone. + s = item->lItr->next(true); + } + + if (s) { + avl_remove(&curWindow, &item->an); + item->flags = ItrItem::none; + s = item->lItr->get(item->lastRec); + assert(s); + + avl_cmp_func* cmp_func = (type == BY_SEQ) + ? (ItrItem::cmpSeq) + : (ItrItem::cmpKey); + avl_node* avl_ret = avl_insert(&curWindow, &item->an, cmp_func); + assert(avl_ret == &item->an); (void)avl_ret; + cursor = avl_first(&curWindow); + } else { + item->flags |= ItrItem::no_more_next; + cursor = avl_next(&item->an); + } + } + + windowCursor = avl_first(&curWindow); + while (windowCursor) { + // Find first valid item. 
+ ItrItem* item = _get_entry(windowCursor, ItrItem, an); + + bool valid = false; + if (type == BY_SEQ) { + valid = checkValidBySeq(item, cur_seq); + } else if (type == BY_KEY) { + valid = checkValidByKey(item, cur_key); + } + + if (!valid) { + windowCursor = avl_next(windowCursor); + } else { + break; + } + } + + cur_key.free(); + + if (!windowCursor) { + // Reached the end. + moveToLastValid(); + return Status::OUT_OF_RANGE; + } + return Status(); +} + +Status LogMgr::Iterator::seek(const SizedBuf& key, SeekOption opt) +{ + return seekInternal(key, NOT_INITIALIZED, opt); +} + +Status LogMgr::Iterator::seekSN(const uint64_t seqnum, SeekOption opt) +{ + SizedBuf dummy_key; + return seekInternal(dummy_key, seqnum, opt); +} + +Status LogMgr::Iterator::gotoBegin() { + SizedBuf empty_key; + return seekInternal(empty_key, 0, GREATER); +} + +Status LogMgr::Iterator::gotoEnd() { + SizedBuf empty_key; + return seekInternal(empty_key, 0, SMALLER, true); +} + +Status LogMgr::Iterator::moveToLastValid() { + windowCursor = avl_last(&curWindow); + while (windowCursor) { + // Find *LAST* valid item (only for BY_KEY). + // + // e.g.) + // ... Del K9 (seq 100), Ins K9 (seq 99) + // We should pick up `Del K9`. + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + + if (type == BY_KEY) { + ItrItem* prev_item = nullptr; + avl_node* prev_cursor = avl_prev(windowCursor); + if (prev_cursor) prev_item = _get_entry(prev_cursor, ItrItem, an); + + if (prev_item) { + int cmp = cmpSizedBuf( item->lastRec.kv.key, + prev_item->lastRec.kv.key ); + if (cmp == 0) { + // Same key, should take previous one. + windowCursor = prev_cursor; + continue; + } + } + } + break; +#if 0 + if (item->flags == ItrItem::none) break; + else windowCursor = avl_prev(windowCursor); +#endif + } + return Status(); +} + +Status LogMgr::Iterator::seekInternal + ( const SizedBuf& key, + const uint64_t seqnum, + SeekOption opt, + bool goto_end ) +{ + Status s; + + // Remove current items from `curWindow`. 
+ std::vector items; + avl_node* cursor = avl_first(&curWindow); + while (cursor) { + ItrItem* item = _get_entry(cursor, ItrItem, an); + cursor = avl_next(&item->an); + avl_remove(&curWindow, &item->an); + items.push_back(item); + } + + // Seek for all items. + for (auto& entry: items) { + ItrItem*& item = entry; + + if (goto_end) { + // Goto end: special case. + s = item->lItr->gotoEnd(); + + } else { + if (type == BY_SEQ) { + s = item->lItr->seekSN(seqnum, (LogFile::Iterator::SeekOption)opt); + } else { + s = item->lItr->seek(key, (LogFile::Iterator::SeekOption)opt); + } + } + + if (s) { + s = item->lItr->get(item->lastRec); + assert(s); + + int cmp = 0; + if (goto_end) { + // Goto end: special case. + cmp = -1; + + } else { + if (type == BY_SEQ) { + if (item->lastRec.seqNum < seqnum) cmp = -1; + else if (item->lastRec.seqNum > seqnum) cmp = 1; + else cmp = 0; + } else { + cmp = cmpSizedBuf(item->lastRec.kv.key, key); + } + } + + item->flags = ItrItem::none; + if (opt == GREATER && cmp < 0) { + item->flags |= ItrItem::no_more_next; + } else if (opt == SMALLER && cmp > 0) { + item->flags |= ItrItem::no_more_prev; + } + } else { + item->flags = ItrItem::no_more_prev | + ItrItem::no_more_next; + } + + avl_cmp_func* cmp_func = (type == BY_SEQ) + ? (ItrItem::cmpSeq) + : (ItrItem::cmpKey); + avl_node* avl_ret = avl_insert(&curWindow, &item->an, cmp_func); + assert(avl_ret == &item->an); (void)avl_ret; + } + + if (opt == GREATER) { + windowCursor = avl_first(&curWindow); + while (windowCursor) { + // Find first valid item. + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + + if (item->flags == ItrItem::none) break; + else windowCursor = avl_next(windowCursor); + } + } else { // SMALLER + moveToLastValid(); + } + + if (!windowCursor) { + // Reached the end. 
+ if (opt == GREATER) windowCursor = avl_last(&curWindow); + if (opt == SMALLER) windowCursor = avl_first(&curWindow); + } + +#if 0 + // Now this feature is done by higher level iterator (i.e., Jungle iterator). + if (type == BY_KEY) { + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + while ( !item->lastRec.isIns() ) { + // Deleted key, move the cursor. + if (opt == GREATER) s = next(); + if (opt == SMALLER) s = prev(); + if (!s) return s; + item = _get_entry(windowCursor, ItrItem, an); + } + } +#endif + + return Status(); +} + + +int LogMgr::Iterator::cmpSizedBuf(const SizedBuf& l, const SizedBuf& r) { + CMP_NULL_CHK(l.data, r.data); + if (lMgr->getDbConfig()->cmpFunc) { + // Custom cmp mode. + CustomCmpFunc func = lMgr->getDbConfig()->cmpFunc; + void* param = lMgr->getDbConfig()->cmpFuncParam; + return func(l.data, l.size, r.data, r.size, param); + } + return SizedBuf::cmp(l, r); +} + +bool LogMgr::Iterator::checkValidBySeq(ItrItem* item, + const uint64_t cur_seq, + const bool is_prev) +{ + if ( ( !is_prev && (item->flags & ItrItem::no_more_next) ) || + ( is_prev && (item->flags & ItrItem::no_more_prev) ) ) { + return false; + } else if (item->lastRec.seqNum == cur_seq) { + // Duplicate item, skip. + return false; + } + return true; +} + +bool LogMgr::Iterator::checkValidByKey(ItrItem* item, + const SizedBuf& cur_key, + const bool is_prev) +{ + if ( ( !is_prev && (item->flags & ItrItem::no_more_next) ) || + ( is_prev && (item->flags & ItrItem::no_more_prev) ) ) { + return false; + } else if (cmpSizedBuf(item->lastRec.kv.key, cur_key) == 0) { + // Duplicate item, skip. 
+ return false; + } + return true; +} + +Status LogMgr::Iterator::close() { + if (!lMgr) return Status(); + + avl_node* cursor = avl_first(&curWindow); + while (cursor) { + ItrItem* item = _get_entry(cursor, ItrItem, an); + cursor = avl_next(&item->an); + avl_remove(&curWindow, &item->an); + } + + for (auto& entry: itrs) { + ItrItem* ctx = entry; + ctx->lItr->close(); + delete ctx->lItr; + if (!snapLogList) { + // Only when not a snapshot. + ctx->lInfo->done(); + } + delete ctx; + } + + lMgr = nullptr; + windowCursor = nullptr; + startKey.free(); + endKey.free(); + return Status(); +} + +}; // namespace jungle + diff --git a/src/log_manifest.cc b/src/log_manifest.cc new file mode 100644 index 0000000..b2c633c --- /dev/null +++ b/src/log_manifest.cc @@ -0,0 +1,746 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "log_manifest.h" + +#include "crc32.h" +#include "db_mgr.h" +#include "event_awaiter.h" +#include "internal_helper.h" +#include "log_mgr.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include +#include + +namespace jungle { + +static uint8_t LOGMANI_FOOTER[8] = {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0xab, 0xcd}; +static uint32_t LOGMANI_VERSION = 0x1; + +void LogManifest::reclaimExpiredLogFiles() { + uint64_t last_synced_log_file_num; + Status s = getLastSyncedLog(last_synced_log_file_num); + if (!s) return; + + std::stringstream size_message; + uint64_t total_memtable_size = 0; + std::vector live_file_acc; + skiplist_node* begin = skiplist_begin(&logFiles); + skiplist_node* cursor = begin; + while (cursor) { + LogFileInfo* info = _get_entry(cursor, LogFileInfo, snode); + info->grab(); + + // Collect memory usage info + if ( !info->file->isMemTablePurged() ) { + uint64_t size = info->file->size(); + if ( size > 0 ) { + size_message << ", logfile_" << info->logFileNum << ": " << size; + total_memtable_size += size; + } + } + + // NOTE: + // Need to keep the first log file, as it may be frequently + // accessed by getting min seqnum or something like that. 
+ if ( cursor != begin && + info->logFileNum < last_synced_log_file_num && + !info->isEvicted() && + !info->isRemoved() && + info->file->isImmutable() && + !info->file->isMemTablePurged() && + info->file->isExpired() ) { + _log_info(myLog, "will purge memtable of log file %zu, " + "last access %zu us ago, ref count %zu", + info->logFileNum, + info->file->getLastAcc(), + info->getRefCount()); + info->setEvicted(); + } + + if ( !info->isEvicted() && + !info->isRemoved() && + info->file->isImmutable() && + !info->file->isMemTablePurged() ) { + live_file_acc.push_back(info->file->getLastAcc()); + } + + cursor = skiplist_next(&logFiles, cursor); + skiplist_release_node(&info->snode); + + info->done(); + } + if (cursor) skiplist_release_node(cursor); + + // Log memory usage info + thread_local uint64_t last_total_log_size = 0; + if ( total_memtable_size != last_total_log_size ) { + _log_info(myLog, "memtable memory usage info, total %lu%s", + total_memtable_size, + size_message.str().c_str()); + last_total_log_size = total_memtable_size; + } + + // TODO: + // What if MemTable loading is too fast and occupies huge memory + // before next reclaimer wake-up? + uint32_t limit = logMgr->getDbConfig()->maxKeepingMemtables; + if ( limit && live_file_acc.size() > limit ) { + + // Too many MemTables are in memory, do urgent reclaiming. 
+ size_t num_files_to_purge = live_file_acc.size() - limit; + std::sort(live_file_acc.begin(), live_file_acc.end()); + + _log_info( myLog, "num memtable %zu exceeds limit %zu, " + "last access %zu us ago", + live_file_acc.size(), + limit, + live_file_acc[limit - 1] ); + + size_t num_purged = 0; + begin = skiplist_begin(&logFiles); + cursor = begin; + while (cursor) { + LogFileInfo* info = _get_entry(cursor, LogFileInfo, snode); + info->grab(); + + if ( cursor != begin && + info->logFileNum < last_synced_log_file_num && + !info->isEvicted() && + !info->isRemoved() && + info->file->isImmutable() && + info->file->getLastAcc() > live_file_acc[limit - 1] ) { + _log_info(myLog, "will purge memtable of log file %zu (urgent), " + "last access %zu us ago, ref count %zu", + info->logFileNum, + info->file->getLastAcc(), + info->getRefCount()); + info->setEvicted(); + num_purged++; + } + + cursor = skiplist_next(&logFiles, cursor); + skiplist_release_node(&info->snode); + + info->done(); + + if (num_purged >= num_files_to_purge) break; + } + if (cursor) skiplist_release_node(cursor); + } +} + +LogManifest::LogManifest(const LogMgr* log_mgr, FileOps* _f_ops, FileOps* _f_l_ops) + : fOps(_f_ops) + , fLogOps(_f_l_ops) + , mFile(nullptr) + , lastFlushedLog(NOT_INITIALIZED) + , lastSyncedLog(NOT_INITIALIZED) + , maxLogFileNum(NOT_INITIALIZED) + , logMgr(log_mgr) + , myLog(nullptr) +{ + skiplist_init(&logFiles, LogFileInfo::cmp); + skiplist_init(&logFilesBySeq, LogFileInfo::cmpBySeq); +} + +LogManifest::~LogManifest() { + // Should join reclaimer first, before releasing skiplist. + // It will be safe as this destructor will be invoked after + // LogMgr::close() is done. + + if (mFile) { + delete mFile; + } + // NOTE: Skip `logFilesBySeq` as they share the actual memory. 
+ skiplist_node* cursor = skiplist_begin(&logFiles); + while (cursor) { + LogFileInfo* info = _get_entry(cursor, LogFileInfo, snode); + LogFile* log_file = info->file; + cursor = skiplist_next(&logFiles, cursor); + + delete log_file; + delete info; + } + skiplist_free(&logFiles); + skiplist_free(&logFilesBySeq); +} + +bool LogManifest::isLogReclaimerActive() { + return logMgr->isTtlMode(); +} + +void LogManifest::spawnReclaimer() { + if ( isLogReclaimerActive() ) { + DBMgr* dbm = DBMgr::getWithoutInit(); + if (!dbm) return; + Status s = dbm->addLogReclaimer(); + if (s) { + _log_info(myLog, "initiated log reclaimer"); + } else { + _log_info(myLog, "log reclaimer already exists"); + } + } +} + +Status LogManifest::create(const std::string& path, + const std::string& filename, + const uint64_t prefix_num) +{ + if (!fOps) return Status::NOT_INITIALIZED; + if (fOps->exist(filename.c_str())) return Status::ALREADY_EXIST; + if (filename.empty()) return Status::INVALID_PARAMETERS; + + dirPath = path; + mFileName = filename; + + // Create a new file. + Status s; + EP( fOps->open(&mFile, mFileName.c_str()) ); + + // Store initial data. + EP( store() ); + + spawnReclaimer(); + return Status(); +} + +Status LogManifest::load(const std::string& path, + const std::string& filename, + const uint64_t prefix_num) +{ + if (!fOps) return Status::NOT_INITIALIZED; + if (mFile) return Status::ALREADY_LOADED; + if (filename.empty()) return Status::INVALID_PARAMETERS; + + dirPath = path; + mFileName = filename; + prefixNum = prefix_num; + + Status s; + Timer tt; + const DBConfig* db_config = logMgr->getDbConfig(); + + EP( fOps->open(&mFile, mFileName.c_str()) ); + + try { + // File should be bigger than 16 bytes (FOOTER + version + CRC32). 
+ size_t file_size = fOps->eof(mFile); + if (file_size < 16) throw Status(Status::FILE_CORRUPTION); + + // Footer check + RwSerializer ss(fOps, mFile); + uint8_t chk_footer[8]; + ss.pos(file_size - 16); + ss.get(chk_footer, 8); + if (memcmp(LOGMANI_FOOTER, chk_footer, 8) != 0) { + throw Status(Status::FILE_CORRUPTION); + } + + // Version + uint32_t ver_file = ss.getU32(s); + (void)ver_file; + + // CRC check + uint32_t crc_file = ss.getU32(s); + + SizedBuf chk_buf(file_size - 4); + SizedBuf::Holder h_chk_buf(chk_buf); + ss.pos(0); + ss.get(chk_buf.data, chk_buf.size); + uint32_t crc_local = crc32_8(chk_buf.data, chk_buf.size, 0); + if (crc_local != crc_file) throw Status(Status::CHECKSUM_ERROR); + + ss.pos(0); + maxLogFileNum.store(ss.getU64(s), MOR); + lastFlushedLog.store(ss.getU64(s), MOR); + lastSyncedLog.store(ss.getU64(s), MOR); + uint32_t num_log_files = ss.getU32(s); + _log_info(myLog, + "max log file num %ld, last flush %ld, last sync %ld, " + "num log files %zu", + maxLogFileNum.load(), lastFlushedLog.load(), + lastSyncedLog.load(), num_log_files); + + uint64_t last_synced_seq = NOT_INITIALIZED; + + bool first_file_read = false; + for (uint32_t ii=0; ii < num_log_files; ++ii) { + LogFile* l_file = new LogFile(logMgr); + l_file->setLogger(myLog); + + uint64_t l_file_num = ss.getU64(s); + std::string l_filename = + LogFile::getLogFileName(dirPath, prefixNum, l_file_num); + + uint64_t min_seq = ss.getU64(s); + uint64_t purged_seq = ss.getU64(s); + uint64_t synced_seq = ss.getU64(s); + + if ( db_config->logSectionOnly && + db_config->truncateInconsecutiveLogs && + valid_number(min_seq) ) { + // Log-only mode, validity check. + bool invalid_log = false; + if ( valid_number(synced_seq) && + min_seq > synced_seq ) { + // This cannot happen, probably caused by + // abnormal shutdown. 
+ _log_warn( myLog, "min seq %s > synced seq %s, break", + _seq_str(min_seq).c_str(), + _seq_str(synced_seq).c_str() ); + invalid_log = true; + } + if ( valid_number(last_synced_seq) && + min_seq != last_synced_seq + 1 ) { + // Inconsecutive sequence number, + // probably caused by abnormal shutdown and then + // re-open. + _log_warn( myLog, "min seq %s and last synced seq %s " + "are inconsecutive, break", + _seq_str(min_seq).c_str(), + _seq_str(last_synced_seq).c_str() ); + invalid_log = true; + } + + if (invalid_log) { + delete l_file; + if (l_file_num) { + maxLogFileNum.store(l_file_num-1, MOR); + lastSyncedLog.store(l_file_num-1, MOR); + _log_warn(myLog, "adjusted max log file num %zu, " + "last synced log file num %zu", + maxLogFileNum.load(), + lastSyncedLog.load()); + } + break; + } + } + + if (valid_number(synced_seq)) { + last_synced_seq = synced_seq; + } + if (!valid_number(min_seq) && valid_number(last_synced_seq)) { + min_seq = last_synced_seq + 1; + } + if (!valid_number(synced_seq) && valid_number(last_synced_seq)) { + synced_seq = last_synced_seq; + } + + s = l_file->load(l_filename, fLogOps, l_file_num, + min_seq, purged_seq, synced_seq); + if (!s) { + _s_warn(myLog) << "log file " << l_file_num << " read error: " << s; + if (s == Status::FILE_NOT_EXIST) { + if ( !first_file_read && + ii + 1 < num_log_files ) { + // If this is the first log file, and there are + // more log files to read, we can tolerate this error. + _log_warn(myLog, "index %zu (out of %zu) log number %zu is " + "the first log yet, skip it", + ii, num_log_files, l_file_num); + lastFlushedLog = NOT_INITIALIZED; + lastSyncedLog = NOT_INITIALIZED; + delete l_file; + continue; + } + + // Log file in the middle or the last one. 
+ _s_warn(myLog) << "something wrong happened, stop loading here"; + delete l_file; + if ( getNumLogFiles() ) { + if (l_file_num) { + maxLogFileNum.store(l_file_num-1, MOR); + lastSyncedLog.store(l_file_num-1, MOR); + _log_warn(myLog, "adjusted max log file num %zu, " + "last synced log file num %zu", + maxLogFileNum.load(), + lastSyncedLog.load()); + } + break; + } + // Otherwise, there is no valid log file. Should create one, + // and set its number to max file number. + l_file = new LogFile(logMgr); + l_file_num = maxLogFileNum; + + // WARNING: WE SHOULD RESET LAST FLUSHED/SYNCED FILE NUMBER. + lastFlushedLog = NOT_INITIALIZED; + lastSyncedLog = NOT_INITIALIZED; + + std::string l_filename = + LogFile::getLogFileName(dirPath, prefixNum, l_file_num); + // NOTE: `start_seq_num` will be re-synced with tables + // after table loading is done. So it is safe to + // set it to 0 here. + l_file->create(l_filename, fLogOps, l_file_num, 0); + + _log_warn(myLog, "no log file is found due to previous crash, " + "created new log file %zu", l_file_num); + + // Make it escape loop. + ii = num_log_files; + } + // Otherwise: tolerate. + } + first_file_read = true; + + _log_info( myLog, + "log %ld, min seq %s, last flush %s, last sync %s", + l_file_num, + _seq_str(min_seq).c_str(), + _seq_str(purged_seq).c_str(), + _seq_str(synced_seq).c_str() ); + + addNewLogFile(l_file_num, l_file, min_seq); + if ( !valid_number(lastSyncedLog) || + lastSyncedLog < l_file_num ) { + lastSyncedLog.store(l_file_num); + } + } + + _log_info(myLog, "loading manifest & log files done: %lu us, " + "flushed %s synced %s max %s", + tt.getUs(), + _seq_str(lastFlushedLog).c_str(), + _seq_str(lastSyncedLog).c_str(), + _seq_str(maxLogFileNum).c_str() ); + + spawnReclaimer(); + return Status(); + + } catch (Status s) { + // Error happened, close file. 
+ fOps->close(mFile); + DELETE(mFile); + return s; + } +} + +Status LogManifest::store() { + if (mFileName.empty() || !fOps) return Status::NOT_INITIALIZED; + + Status s; + + // Tolerate backup failure. + //BackupRestore::backup(fOps, mFileName); + + SizedBuf mani_buf(4096); + SizedBuf::Holder h_mani_buf(mani_buf); + // Resizable serializer. + RwSerializer ss(&mani_buf); + + // << Log manifest file format >> + // Latest log file number, 8 bytes + // Last flushed log file number, 8 bytes + // Last synced log file number, 8 bytes + // Number of log files, 4 bytes + uint32_t num_log_files = skiplist_get_size(&logFiles); + ss.putU64(maxLogFileNum); + ss.putU64(lastFlushedLog); + ss.putU64(lastSyncedLog); + ss.putU32(num_log_files); + _log_debug(myLog, + "max log file num %ld, last flush %ld, last sync %ld, " + "num log files %zu", + maxLogFileNum.load(), lastFlushedLog.load(), + lastSyncedLog.load(), num_log_files); + + skiplist_node* cursor = skiplist_begin(&logFiles); + while (cursor) { + // << Log info entry format >> + // Log file number, 8 bytes + // Min seq number, 8 bytes + // Last flushed seq number, 8 bytes + // Last synced seq number, 8 bytes + LogFileInfo* info = _get_entry(cursor, LogFileInfo, snode); + LogFile* l_file = info->file; + + ss.putU64(info->logFileNum); + ss.putU64(l_file->getMinSeqNum()); + ss.putU64(l_file->getFlushedSeqNum()); + ss.putU64(l_file->getSyncedSeqNum()); + _log_trace(myLog, + "log %ld, min seq %ld, last flush %ld, last sync %ld", + info->logFileNum, l_file->getMinSeqNum(), + l_file->getFlushedSeqNum(), l_file->getSyncedSeqNum()); + + cursor = skiplist_next(&logFiles, cursor); + skiplist_release_node(&info->snode); + } + if (cursor) skiplist_release_node(cursor); + + ss.put(LOGMANI_FOOTER, 8); + + // Version + ss.putU32(LOGMANI_VERSION); + + // CRC32 + uint32_t crc_val = crc32_8(mani_buf.data, ss.pos(), 0); + + ss.putU32(crc_val); + + EP( fOps->pwrite(mFile, mani_buf.data, ss.pos(), 0) ); + + // Should truncate tail. 
+ fOps->ftruncate(mFile, ss.pos()); + + // After success, make a backup file one more time, + // using the latest data. + // Same as above, tolerate backup failure. + BackupRestore::backup(fOps, mFileName, mani_buf, ss.pos()); + + return s; +} + +Status LogManifest::sync() { + return fOps->fsync(mFile); +} + +Status LogManifest::issueLogFileNumber(uint64_t& new_log_file_number) { + uint64_t expected = NOT_INITIALIZED; + uint64_t val = 0; + if (maxLogFileNum.compare_exchange_weak(expected, val)) { + // The first log file, number 0. + } else { + // Otherwise: current max + 1. + do { + expected = maxLogFileNum; + val = maxLogFileNum + 1; + } while (!maxLogFileNum.compare_exchange_weak(expected, val)); + } + new_log_file_number = val; + return Status(); +} + +Status LogManifest::rollbackLogFileNumber(uint64_t to) { + maxLogFileNum = to; + return Status(); +} + +bool LogManifest::logFileExist(const uint64_t log_num) { + LogFileInfo query(log_num); + skiplist_node* cursor = skiplist_find(&logFiles, &query.snode); + if (!cursor) { + return false; + } + skiplist_release_node(cursor); + return true; +} + +Status LogManifest::getLogFileInfo(const uint64_t log_num, + LogFileInfo*& info_out, + bool force_not_load_memtable) +{ + LogFileInfo query(log_num); + skiplist_node* cursor = skiplist_find(&logFiles, &query.snode); + if (!cursor) { + return Status::LOG_FILE_NOT_FOUND; + } + info_out = _get_entry(cursor, LogFileInfo, snode); + if (force_not_load_memtable) { + info_out->grab(); + } else { + info_out->grab(isLogReclaimerActive()); + } + skiplist_release_node(cursor); + return Status(); +} + +Status LogManifest::getLogFileInfoRange(const uint64_t s_log_inc, + const uint64_t e_log_inc, + std::vector& info_out, + bool force_not_load_memtable) +{ + LogFileInfo query(s_log_inc); + skiplist_node* cursor = + skiplist_find_greater_or_equal(&logFiles, &query.snode); + if (!cursor) { + return Status::LOG_FILE_NOT_FOUND; + } + + while (cursor) { + LogFileInfo* l_info = 
_get_entry(cursor, LogFileInfo, snode); + if (force_not_load_memtable) { + l_info->grab(); + } else { + l_info->grab(isLogReclaimerActive()); + } + info_out.push_back(l_info); + + if (l_info->logFileNum >= e_log_inc) { + cursor = nullptr; + } else { + cursor = skiplist_next(&logFiles, &l_info->snode); + } + skiplist_release_node(&l_info->snode); + } + return Status(); +} + +Status LogManifest::getLogFileInfoBySeq(const uint64_t seq_num, + LogFileInfo*& info_out, + bool force_not_load_memtable) +{ + LogFileInfo query(0); + query.startSeq = seq_num; + skiplist_node* cursor = skiplist_find_smaller_or_equal + ( &logFilesBySeq, &query.snodeBySeq ); + if (!cursor) return Status::LOG_FILE_NOT_FOUND; + + LogFileInfo* info = _get_entry(cursor, LogFileInfo, snodeBySeq); + LogFile* file = info->file; + + uint64_t file_min_seq = file->getMinSeqNum(); + if (valid_number(file_min_seq) && file_min_seq > seq_num) { + // WARNING: This can happen for the first log file, + // if user uses custom seqnum which is bigger than + // the expected seqnum. 
+ skiplist_release_node(cursor); + return Status::LOG_FILE_NOT_FOUND; + } + if (file->getMaxSeqNum() < seq_num) { + skiplist_release_node(cursor); + return Status::LOG_FILE_NOT_FOUND; + } + + info_out = info; + if (force_not_load_memtable) { + info_out->grab(); + } else { + info_out->grab(isLogReclaimerActive()); + } + skiplist_release_node(cursor); + return Status(); +} + +LogFileInfo* LogManifest::getLogFileInfoP(uint64_t log_num, + bool force_not_load_memtable) { + LogFileInfo* ret = nullptr; + Status s = getLogFileInfo(log_num, ret, force_not_load_memtable); + if (!s) return nullptr; + return ret; +} + +Status LogManifest::addNewLogFile(uint64_t log_num, + LogFile* log_file, + uint64_t start_seqnum) +{ + LogFileInfo* info = new LogFileInfo(log_num); + if (!info) return Status::ALLOCATION_FAILURE; + + info->file = log_file; + info->startSeq = start_seqnum; + skiplist_insert(&logFilesBySeq, &info->snodeBySeq); + skiplist_insert(&logFiles, &info->snode); + + return Status(); +} + +Status LogManifest::removeLogFile(uint64_t log_num) { + LogFileInfo query(log_num); + skiplist_node* cursor = skiplist_find(&logFiles, &query.snode); + if (!cursor) { + return Status::LOG_FILE_NOT_FOUND; + } + + LogFileInfo* info = _get_entry(cursor, LogFileInfo, snode); + + // NOTE: the last done() call will kill itself (suicide). 
+    info->setRemoved();
+
+    // Detach from the by-number index and wait until no concurrent
+    // reader still holds the node.
+    skiplist_erase_node(&logFiles, &info->snode);
+    skiplist_release_node(&info->snode);
+    skiplist_wait_for_free(&info->snode);
+
+    // FIX: wait on `snodeBySeq` (the node erased just above), not on
+    // `snode`, which was already waited on. The original code waited on
+    // the wrong node, so a reader of the by-seq index could still be
+    // referencing `snodeBySeq` while the info object gets destroyed.
+    skiplist_erase_node(&logFilesBySeq, &info->snodeBySeq);
+    skiplist_wait_for_free(&info->snodeBySeq);
+
+    return Status();
+}
+
+Status LogManifest::getLogFileNumBySeq(const uint64_t seqnum,
+                                       uint64_t& log_file_num_out,
+                                       bool force_not_load_memtable)
+{
+    LogFileInfo* info;
+    Status s;
+    EP( getLogFileInfoBySeq(seqnum, info, force_not_load_memtable) );
+    LogFileInfoGuard gg(info);
+    if (!info->file) return Status::NOT_INITIALIZED;
+    log_file_num_out = info->file->getLogFileNum();
+    return Status();
+}
+
+Status LogManifest::getMaxLogFileNum(uint64_t& log_file_num_out) {
+    uint64_t max_log_file_num = maxLogFileNum.load(MOR);
+    if (max_log_file_num == NOT_INITIALIZED)
+        return Status::NOT_INITIALIZED;
+
+    log_file_num_out = max_log_file_num;
+    return Status();
+}
+
+Status LogManifest::setMaxLogFileNum(uint64_t cur_num, uint64_t new_num) {
+    // FIX: use the strong CAS here. `compare_exchange_weak` is allowed to
+    // fail spuriously even when the stored value equals `cur_num`; outside
+    // a retry loop that would make this function return ERROR incorrectly.
+    if (maxLogFileNum.compare_exchange_strong(cur_num, new_num)) {
+        return Status();
+    }
+    return Status::ERROR;
+}
+
+
+Status LogManifest::getMinLogFileNum(uint64_t& log_file_num_out) {
+    skiplist_node* cursor = skiplist_begin(&logFiles);
+    if (!cursor) {
+        return Status::NOT_INITIALIZED;
+    }
+    LogFileInfo* info = _get_entry(cursor, LogFileInfo, snode);
+    log_file_num_out = info->logFileNum;
+    skiplist_release_node(cursor);
+    return Status();
+}
+
+Status LogManifest::getLastFlushedLog(uint64_t& last_flushed_log) {
+    if (lastFlushedLog == NOT_INITIALIZED) {
+        // Flush never happened yet, return the min log file number.
+        // If no log file exists, return error.
+ Status s; + s = getMinLogFileNum(last_flushed_log); + if (!s) last_flushed_log = NOT_INITIALIZED; + return s; + } + last_flushed_log = lastFlushedLog; + return Status(); +} + +Status LogManifest::getLastSyncedLog(uint64_t& last_synced_log) { + last_synced_log = lastSyncedLog; + if (lastSyncedLog == NOT_INITIALIZED) { + return Status::NOT_INITIALIZED; + } + + return Status(); +} + +size_t LogManifest::getNumLogFiles() { + return skiplist_get_size(&logFiles); +} + + +} // namespace jungle + diff --git a/src/log_manifest.h b/src/log_manifest.h new file mode 100644 index 0000000..b1c7dea --- /dev/null +++ b/src/log_manifest.h @@ -0,0 +1,287 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/
+
+#pragma once
+
+#include "fileops_base.h"
+#include "internal_helper.h"
+#include "log_file.h"
+#include "skiplist.h"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+class SimpleLogger;
+
+namespace jungle {
+
+// Ref-counted bookkeeping entry for one log file, linked into both the
+// by-number (`snode`) and by-start-seq (`snodeBySeq`) skiplists.
+struct LogFileInfo {
+    // FIX: the original default constructor was
+    //   LogFileInfo() { LogFileInfo(0); }
+    // which only creates and destroys a temporary, leaving every member of
+    // `*this` (including both skiplist nodes) uninitialized -- undefined
+    // behavior on first use. Delegate to the main constructor instead.
+    LogFileInfo() : LogFileInfo(0) {}
+    LogFileInfo(uint64_t _log_file_num)
+        : logFileNum(_log_file_num)
+        , startSeq(0)
+        , file(nullptr)
+        , refCount(0)
+        , removed(false)
+        , evicted(false) {
+        skiplist_init_node(&snodeBySeq);
+        skiplist_init_node(&snode);
+    }
+    ~LogFileInfo() {
+        skiplist_free_node(&snodeBySeq);
+        skiplist_free_node(&snode);
+        assert(refCount == 0);
+    }
+
+    // Skiplist comparator: order entries by log file number.
+    static int cmp(skiplist_node *a, skiplist_node *b, void *aux) {
+        LogFileInfo *aa, *bb;
+        aa = _get_entry(a, LogFileInfo, snode);
+        bb = _get_entry(b, LogFileInfo, snode);
+
+        if (aa->logFileNum < bb->logFileNum) return -1;
+        if (aa->logFileNum > bb->logFileNum) return 1;
+        return 0;
+    }
+
+    // Skiplist comparator: order entries by starting (minimum) seq number.
+    static int cmpBySeq(skiplist_node *a, skiplist_node *b, void *aux) {
+        LogFileInfo *aa, *bb;
+        aa = _get_entry(a, LogFileInfo, snodeBySeq);
+        bb = _get_entry(b, LogFileInfo, snodeBySeq);
+
+        if (aa->startSeq < bb->startSeq) return -1;
+        if (aa->startSeq > bb->startSeq) return 1;
+        return 0;
+    }
+
+    // Increase ref count; optionally reload the purged memtable under
+    // `evictionLock` so grab cannot race with the reclaimer's eviction.
+    void grab(bool load_memtable_if_needed = false) {
+        refCount.fetch_add(1);
+
+        std::lock_guard l(evictionLock);
+        if (load_memtable_if_needed) {
+            if (file->isMemTablePurged()) {
+                file->loadMemTable();
+                evicted = false;
+            }
+        }
+    }
+
+    void done() {
+        assert(refCount);
+        // WARNING: MONSTOR-11561
+        //   We should check `removed` or `evicted` flag first
+        //   and then decrease `refCount`.
+        //
+        // e.g.)
+        //   T1: done(), decrease refCount to 0, count = 1.
+        //   --- context switch ---
+        //   T2: Increase ref count to 1.
+        //   T2: Do flush, remove file, removed = true.
+        //   T2: done(), decrease refCount to 0, count = 1.
+        //   T2: removed == true && count == 1, destroy the file.
+ // --- context switch --- + // T1: removed == true && count == 1, destroy the file. + if (removed) { + uint64_t count = refCount.fetch_sub(1); + if (count == 1) { + // WARNING: + // While reclaimer is evicting memtable below (in that case + // refCount == 0 already), LogMgr::flush() may grab a file, + // remove it, and then release, which will get into here. + // In this case this file will not be protected and causes crash. + // We should do busy-waiting for eviction here. + // + // * Note that opposite CANNOT happen, because + // 1) This node is already detached from skiplist, and + // 2) once `removed` flag is set, reclaimer cannot handle this file. + // + if (evicted) { + while (!file->isMemTablePurged()) std::this_thread::yield(); + // Wait for other job done. + } + file->destroySelf(); + delete file; + delete this; + } + return; + } + + if (evicted) { + uint64_t count = refCount.fetch_sub(1); + if (count == 1) { + std::lock_guard l(evictionLock); + // WARNING: + // There shouldn't be another thread that called grab() before this. + uint64_t count_protected = refCount.load(); + if (count_protected == 0 && !file->isMemTablePurged()) { + file->purgeMemTable(); + } + } + return; + } + + // Normal case. + refCount.fetch_sub(1); + } + uint64_t getRefCount() const { return refCount.load(); } + void setRemoved() { removed.store(true); } + bool isRemoved() { return removed.load(); } + void setEvicted() { + std::lock_guard l(evictionLock); + evicted.store(true); + } + bool isEvicted() { return evicted.load(); } + + skiplist_node snode; + skiplist_node snodeBySeq; + + uint64_t logFileNum; + + uint64_t startSeq; + + LogFile* file; + + // Reference counter. + std::atomic refCount; + + // Flag indicating whether or not this file is removed. + std::atomic removed; + + // Flag indicating whether or not this file is evicted from LRU. + std::atomic evicted; + + // Lock for loading & evicting mem-table. 
+ std::mutex evictionLock; +}; + +struct LogFileInfoGuard { + LogFileInfoGuard(LogFileInfo* _ptr) : ptr(_ptr) {} + ~LogFileInfoGuard() { if (ptr) ptr->done(); } + void operator=(const LogFileInfoGuard& src) { + LogFileInfo* tmp = ptr; + ptr = src.ptr; + ptr->grab(true); + if (tmp) tmp->done(); + } + bool empty() const { return (ptr == nullptr); } + LogFileInfo* operator->() const { return ptr; } + LogFile* file() { return ptr->file; } + LogFileInfo* ptr; +}; + +class LogMgr; +class LogManifest { +public: + LogManifest(const LogMgr* log_mgr, FileOps* _f_ops, FileOps* _f_l_ops); + ~LogManifest(); + + bool isLogReclaimerActive(); + + void spawnReclaimer(); + + Status create(const std::string& path, + const std::string& filename, + const uint64_t prefix_num); + Status load(const std::string& path, + const std::string& filename, + const uint64_t prefix_num); + Status store(); + static Status copyFile(FileOps* f_ops, + const std::string& src_file, + const std::string& dst_file); + static Status backup(FileOps* f_ops, + const std::string& filename); + static Status restore(FileOps* f_ops, + const std::string& filename); + Status sync(); + + Status issueLogFileNumber(uint64_t& new_log_file_number); + + Status rollbackLogFileNumber(uint64_t to); + + Status addNewLogFile(uint64_t log_num, + LogFile* log_file, + uint64_t start_seqnum); + Status removeLogFile(uint64_t log_num); + + bool logFileExist(const uint64_t log_num); + + Status getLogFileInfo(uint64_t log_num, + LogFileInfo*& info_out, + bool force_not_load_memtable = false); + Status getLogFileInfoRange(const uint64_t s_log_inc, + const uint64_t e_log_inc, + std::vector& info_out, + bool force_not_load_memtable = false); + Status getLogFileInfoBySeq(const uint64_t seq_num, + LogFileInfo*& info_out, + bool force_not_load_memtable = false); + LogFileInfo* getLogFileInfoP(uint64_t log_num, + bool force_not_load_memtable = false); + + Status getLogFileNumBySeq(const uint64_t seq_num, + uint64_t& log_file_num_out, + bool 
force_not_load_memtable = false); + Status getMaxLogFileNum(uint64_t& log_file_num_out); + Status setMaxLogFileNum(uint64_t cur_num, uint64_t new_num); + + Status getMinLogFileNum(uint64_t& log_file_num_out); + Status getLastFlushedLog(uint64_t& last_purged_log); + Status getLastSyncedLog(uint64_t& last_synced_log); + + void setLastSyncedLog(const uint64_t log_num) { + lastSyncedLog = log_num; + } + void setLastFlushedLog(const uint64_t log_num) { + lastFlushedLog = log_num; + } + + size_t getNumLogFiles(); + + void reclaimExpiredLogFiles(); + + void setLogger(SimpleLogger* logger) { myLog = logger; } + +private: + FileOps* fOps; + FileOps* fLogOps; + FileHandle* mFile; + std::string dirPath; + std::string mFileName; + uint64_t prefixNum; + std::atomic lastFlushedLog; + std::atomic lastSyncedLog; + std::atomic maxLogFileNum; + + // Log files by its file number. + skiplist_raw logFiles; + + // Log files by its starting (minimum) seq number. + // Entries are shared with `logFiles`. + skiplist_raw logFilesBySeq; + + const LogMgr* logMgr; + SimpleLogger* myLog; +}; + +} // namespace jungle + diff --git a/src/log_mgr.cc b/src/log_mgr.cc new file mode 100644 index 0000000..f2d005a --- /dev/null +++ b/src/log_mgr.cc @@ -0,0 +1,1404 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "log_mgr.h" + +#include "db_internal.h" +#include "db_mgr.h" +#include "fileops_base.h" +#include "internal_helper.h" +#include "murmurhash3.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include +#include +#include +#include + +namespace jungle { + +LogMgr::LogMgr(DB* parent_db, const LogMgrOptions& _options) + : parentDb(parent_db) + , initialized(false) + , opt(_options) + , mani(nullptr) + , throttlingRate(0) + , lastFlushIntervalMs(0) + , numSetRecords(0) + , myLog(nullptr) + , vlSync(VERBOSE_LOG_SUPPRESS_MS) + {} + +LogMgr::~LogMgr() { + assert(sMap.size() == 0); + delete mani; +} + +Status LogMgr::init(const LogMgrOptions& _options) { + if (mani) return Status::ALREADY_INITIALIZED; + + opt = _options; + syncSema.enabled = true; + flushSema.enabled = true; + reclaimSema.enabled = true; + + Status s; + mani = new LogManifest(this, + opt.fOps, + opt.dbConfig->directIo + && FileOps::supportDirectIO() + ? opt.fDirectOps : opt.fOps); + if (!mani) return Status::ALLOCATION_FAILURE; + mani->setLogger(myLog); + + if (getDbConfig()->logSectionOnly) { + DBMgr* dbm = DBMgr::getWithoutInit(); + GlobalConfig* g_config = (dbm) ? dbm->getGlobalConfig() : nullptr; + _log_info(myLog, + "log-section mode: " + "reclaimer period %u seconds, TTL %u seconds", + (g_config) ? g_config->logFileReclaimerSleep_sec : 0, + getDbConfig()->logFileTtl_sec); + } + + try { + char p_num[16]; + sprintf(p_num, "%04" PRIu64, opt.prefixNum); + std::string m_filename = opt.path + "/log" + p_num + "_manifest"; + + if (opt.fOps->exist(m_filename.c_str())) { + // Manifest file already exists, load it. + s = mani->load(opt.path, m_filename, opt.prefixNum); + if (!s) { + // Error happened, try again using backup file. 
+ _log_err(myLog, "loading manifest error: %d, try again", s); + TC(BackupRestore::restore(opt.fOps, m_filename)); + s = mani->load(opt.path, m_filename, opt.prefixNum); + } + if (!s) throw s; + + } else { + // Not exist, initial setup phase. + + // Create manifest file. + TC(mani->create(opt.path, m_filename, opt.prefixNum)); + + // Get new log file number, and file name + uint64_t log_num; + TC(mani->issueLogFileNumber(log_num)); + std::string l_filename = + LogFile::getLogFileName(opt.path, opt.prefixNum, log_num); + + // Create a log file and add it to manifest. + LogFile* l_file = new LogFile(this); + l_file->setLogger(myLog); + + try { + TC(l_file->create(l_filename, + opt.dbConfig->directIo + && FileOps::supportDirectIO() + ? opt.fDirectOps : opt.fOps, + log_num, + 0)); + TC(mani->addNewLogFile(log_num, l_file, 1)); + + } catch (Status s) { + delete l_file; + throw s; + } + + // Sync manifest file. + mani->store(); + mani->sync(); + } + mani->setLogger(myLog); + + logMgrSettings(); + + removeStaleFiles(); + + initialized = true; + return Status(); + + } catch (Status s) { + _log_err(myLog, "init manifest error: %d", s); + DELETE(mani); + return s; + } +} + +void LogMgr::logMgrSettings() { + DBMgr* mgr = DBMgr::getWithoutInit(); + assert(mgr); + + GlobalConfig* g_conf = mgr->getGlobalConfig(); + + _log_info(myLog, "initialized log manager, memtable flush buffer %zu", + g_conf->memTableFlushBufferSize); +} + +Status LogMgr::rollback(uint64_t seq_upto) { + Status s; + + DBMgr* mgr = DBMgr::getWithoutInit(); + DebugParams d_params = mgr->getDebugParams(); + + // Return error in read-only mode. + if (getDbConfig()->readOnly) return Status::WRITE_VIOLATION; + + // WARNING: + // Both syncing (memtable -> log) and flushing (log -> table) + // should be blocked during rollback. + + Timer tt; + const size_t MAX_RETRY_MS = 1000; // 1 second. 
+ tt.setDurationMs(MAX_RETRY_MS); + + OpSemaWrapper ow_sync(&syncSema); + while (!ow_sync.acquire()) { + if (tt.timeout()) { + _log_err(myLog, "rollback timeout due to sync"); + return Status::TIMEOUT; + } + Timer::sleepMs(10); + } + assert(ow_sync.op_sema->enabled); + + OpSemaWrapper ow_flush(&flushSema); + while (!ow_flush.acquire()) { + if (tt.timeout()) { + _log_err(myLog, "rollback timeout due to flush"); + return Status::TIMEOUT; + } + Timer::sleepMs(10); + } + assert(ow_flush.op_sema->enabled); + + OpSemaWrapper ow_reclaim(&reclaimSema); + while (!ow_reclaim.acquire()) { + if (tt.timeout()) { + _log_err(myLog, "rollback timeout due to reclaim"); + return Status::TIMEOUT; + } + Timer::sleepMs(10); + } + assert(ow_reclaim.op_sema->enabled); + + _log_info(myLog, "[ROLLBACK] upto %zu", seq_upto); + + // Set rollback flag, it will be reset on exit of this function. + GcFunc gc_func( [this]() -> void + {this->parentDb->p->flags.rollbackInProgress = false;} ); + this->parentDb->p->flags.rollbackInProgress = true; + + // Should sync first. + EP( syncInternal(false) ); + + // Find corresponding log file. + LogFileInfo* linfo; + EP( mani->getLogFileInfoBySeq(seq_upto, linfo) ); + + LogFileInfoGuard gg(linfo); + if (gg.empty() || gg.ptr->isRemoved()) return Status::SEQNUM_NOT_FOUND; + + // Truncate that file. + EP( gg.file()->truncate(seq_upto) ); + + if (d_params.rollbackDelayUs) { + // If debug parameter is given, sleep here. + Timer::sleepUs(d_params.rollbackDelayUs); + } + + gg.file()->setImmutable(); + _log_info(myLog, "[ROLLBACK] truncated log file %zu", linfo->logFileNum); + + // Remove all log files after that. + uint64_t lf_max = 0; + mani->getMaxLogFileNum(lf_max); + for (uint64_t ii = linfo->logFileNum + 1; ii <= lf_max; ++ii) { + // Avoid loading memtable because of this call. + LogFileInfoGuard ll(mani->getLogFileInfoP(ii, true)); + + // Remove file from manifest. 
+ mani->removeLogFile(ii); + _log_info(myLog, "[ROLLBACK] removed log file %ld.", ii); + + if (d_params.rollbackDelayUs) { + // If debug parameter is given, sleep here. + Timer::sleepUs(d_params.rollbackDelayUs); + } + } + mani->rollbackLogFileNumber(linfo->logFileNum); + + // Adjust manifest, and store. + mani->setLastSyncedLog(linfo->logFileNum); + mani->store(); + mani->sync(); + _log_info(myLog, "[ROLLBACK] now %zu is the last seqnum", seq_upto); + + DBMgr::get()->forceRemoveFiles(); + + // Reload the entire memtable, to purge rolled-back records. + linfo->setEvicted(); + + return Status(); +} + +Status LogMgr::removeStaleFiles() { + // Do nothing in read only mode. + if (getDbConfig()->readOnly) return Status(); + + std::vector files; + FileMgr::scan(opt.path, files); + + char p_num[16]; + sprintf(p_num, "%04" PRIu64, opt.prefixNum); + std::string prefix = "log"; + prefix += p_num; + prefix += "_"; + size_t prefix_len = prefix.size(); + + std::string m_filename = "log"; + m_filename += p_num; + m_filename += "_manifest"; + + bool need_mani_sync = false; + for (auto& entry: files) { + std::string& ff = entry; + size_t pos = ff.find(prefix); + if ( pos != std::string::npos && + ff.find(m_filename) == std::string::npos ) { + // Check if it is in manifest. + uint64_t log_num = atoi( ff.substr( prefix_len, + ff.size() - prefix_len ).c_str() ); + if (!mani->logFileExist(log_num)) { + Timer tt; + opt.fOps->remove(opt.path + "/" + ff); + need_mani_sync = true; + _log_warn(myLog, "%s does not exist in manifest, removed. %zu us", + ff.c_str(), tt.getUs()); + } + } + } + + if (need_mani_sync) { + // Should sync manifest file. 
+ mani->store(); + mani->sync(); + _log_info(myLog, "done manifest sync for stale file removal"); + } + + return Status(); +} + +bool LogMgr::isTtlMode() const { + if ( getDbConfig()->logSectionOnly && + getDbConfig()->logFileTtl_sec ) { + return true; + } + return false; +} + +Status LogMgr::openSnapshot(DB* snap_handle, + const uint64_t checkpoint, + std::list*& log_file_list_out) +{ + Status s; + LogFileList* l_list = new LogFileList(); + { mGuard l(sMapLock); + sMap.insert( std::make_pair(snap_handle, l_list) ); + } + + uint64_t l_num_min; + uint64_t l_num_max; + + for (;;) { + // Special case: empty DB (both checkpoint and last flush are zero). + if ( !snap_handle->sn->chkNum && + !snap_handle->sn->lastFlush ) { + // Skip this phase, let the list empty. + break; + } + + // Get log file numbers. + EP( mani->getMinLogFileNum(l_num_min) ); + EP( mani->getLogFileNumBySeq(checkpoint, l_num_max) ); + + bool retry_init = false; + for (uint64_t ii = l_num_min; ii <= l_num_max; ++ii) { + LogFileInfo* l_info = nullptr; + s = mani->getLogFileInfo(ii, l_info); + + if (!s || l_info->isRemoved()) { + // `l_num_min` is invalid. Retry. + // Free all resources. + for (auto& entry: *l_list) { + LogFileInfo* item = entry; + item->done(); + } + if (s) l_info->done(); + + retry_init = true; + break; + } + l_list->push_back(l_info); + } + if (retry_init) continue; + break; + } + + log_file_list_out = l_list; + return Status(); +} + +Status LogMgr::closeSnapshot(DB* snap_handle) { + Status s; + LogFileList* l_list = nullptr; + { mGuard l(sMapLock); + auto entry = sMap.find(snap_handle); + assert(entry != sMap.end()); + l_list = entry->second; + sMap.erase(entry); + } + + for (auto& entry: *l_list) { + LogFileInfo* info = entry; + info->done(); + } + delete l_list; + return Status(); +} + +Status LogMgr::setByBulkLoader(std::list& batch, + TableMgr* table_mgr, + bool last_batch) +{ + // Return error in read-only mode. 
+ if (getDbConfig()->readOnly) return Status::WRITE_VIOLATION; + + Status s; + + uint64_t log_file_num = 0; + EP(mani->getMaxLogFileNum(log_file_num)); + LogFileInfoGuard g_li(mani->getLogFileInfoP(log_file_num)); + + if (g_li.empty() || g_li.ptr->isRemoved()) { + // This shouldn't happen. + assert(0); + return Status::ERROR; + } + + for (Record*& rec: batch) { + // In bulk loading mode, user cannot assign seq number. + g_li.file()->assignSeqNum(*rec); + + // All seq numbers need to be synchronized. + g_li.file()->updateSeqNumByBulkLoader(rec->seqNum); + mani->setLastSyncedLog(log_file_num); + mani->setLastFlushedLog(log_file_num); + } + + std::list dummy; + s = table_mgr->setBatch(batch, dummy, !last_batch); + + return s; +} + +Status LogMgr::addNewLogFile(LogFileInfoGuard& cur_log_file_info, + LogFileInfoGuard& new_log_file_info) +{ + std::unique_lock ll(addNewLogFileMutex); + Status s; + + // Log file is full. Add one more file. + uint64_t new_log_num = 0; + uint64_t max_log_num = 0; + s = mani->getMaxLogFileNum(max_log_num); + if (s) new_log_num = max_log_num + 1; + + uint64_t log_file_num = cur_log_file_info.ptr->logFileNum; + if (max_log_num == log_file_num) { + // Set existing file immutable. + cur_log_file_info->file->setImmutable(); + + std::string l_filename = + LogFile::getLogFileName(opt.path, opt.prefixNum, new_log_num); + LogFile* l_new_file = new LogFile(this); + l_new_file->setLogger(myLog); + uint64_t start_seqnum = cur_log_file_info->file->getMaxSeqNum() + 1; + l_new_file->create( l_filename, + opt.dbConfig->directIo + && FileOps::supportDirectIO() + ? opt.fDirectOps : opt.fOps, + new_log_num, + start_seqnum ); + mani->addNewLogFile(new_log_num, l_new_file, start_seqnum); + s = mani->setMaxLogFileNum(log_file_num, new_log_num); + assert(s); + + ll.unlock(); + _log_info(myLog, "moved to a new log file %ld, start seq %s", + new_log_num, _seq_str(start_seqnum).c_str()); + + // Sync manifest file. 
+ mani->store(); + //mani->sync(); + } else { + // Otherwise, other thread already added a new log file. + ll.unlock(); + } + + LogFileInfo* lf_info = nullptr; + do { + mani->getMaxLogFileNum(new_log_num); + lf_info = mani->getLogFileInfoP(new_log_num); + } while (!lf_info || lf_info->isRemoved()); + + new_log_file_info = LogFileInfoGuard(lf_info); + + return Status(); +} + +Status LogMgr::setSN(const Record& rec) { + Timer tt; + + Status s; + uint64_t log_file_num = 0; + uint64_t max_log_file_num = 0; + bool overwrite = false; + + if (parentDb) parentDb->p->updateOpHistory(); + + // Return error in read-only mode. + if (getDbConfig()->readOnly) return Status::WRITE_VIOLATION; + + // seqnum should start from 1. + if ( rec.seqNum == 0 ) return Status::INVALID_SEQNUM; + + // All writes will be serialized, except for throttling part. + std::unique_lock wm(writeMutex); + + // Get latest log file. + LogFileInfo* lf_info = nullptr; + do { + EP(mani->getMaxLogFileNum(max_log_file_num)); + log_file_num = max_log_file_num; + + if ( valid_number(rec.seqNum) && + getDbConfig()->allowOverwriteSeqNum ) { + // May overwrite existing seqnum, get corresponding log file. + s = mani->getLogFileNumBySeq(rec.seqNum, log_file_num); + if (s) overwrite = true; + // If not exist, use the latest file. + } + lf_info = mani->getLogFileInfoP(log_file_num); + + } while (!lf_info || lf_info->isRemoved()); + LogFileInfoGuard g_li(lf_info); + + // If 1) this file is not writable, AND + // 2) 1) overwrite is not allowed, OR + // 2) overwrite is allowed, but not overwriting. 
+ if ( !g_li->file->isValidToWrite() && + ( !getDbConfig()->allowOverwriteSeqNum || + !overwrite ) ) { + addNewLogFile(g_li, g_li); + + DBMgr* dbm = DBMgr::getWithoutInit(); + DebugParams dp = dbm->getDebugParams(); + if (dp.addNewLogFileCb) { + DebugParams::GenericCbParams p; + dp.addNewLogFileCb(p); + } + } + + EP(g_li->file->setSN(rec)); + + if (g_li->file->isImmutable()) { + // Overwrote immutable file, + // reset sync/flush seq num if necessary. + uint64_t last_synced_log = 0; + s = mani->getLastSyncedLog(last_synced_log); + if (s && log_file_num < last_synced_log) { + mani->setLastSyncedLog(log_file_num); + } + uint64_t last_flushed_log = 0; + s = mani->getLastFlushedLog(last_flushed_log); + if (s && log_file_num < last_flushed_log) { + mani->setLastFlushedLog(log_file_num); + } + } + + wm.unlock(); + if (throttlingRate > 0) { + DBMgr* mgr = DBMgr::getWithoutInit(); + GlobalConfig* g_config = mgr->getGlobalConfig(); + + // Throttling. + double exp_us = 1000000.0 / throttlingRate.load(); + + size_t effective_time_ms = + std::min( lastFlushIntervalMs.load(), + (int64_t)THROTTLING_EFFECTIVE_TIME_MS ); + size_t num_log_files = mani->getNumLogFiles(); + size_t log_files_limit = g_config->flusherMinLogFilesToTrigger * 2; + if (num_log_files > log_files_limit) { + effective_time_ms *= (num_log_files - log_files_limit); + } + + uint64_t throttle_age_ms = throttlingRateTimer.getUs() / 1000; + if ( effective_time_ms && + throttle_age_ms < effective_time_ms ) { + // Should consider age. + exp_us *= (effective_time_ms - throttle_age_ms); + exp_us /= effective_time_ms; + + double cur_us = tt.getUs(); + if ( exp_us > cur_us ) { + // Throttle incoming writes. 
+ double remaining_us = exp_us - cur_us; + if (remaining_us > 1.0) { + Timer::sleepUs((uint64_t)remaining_us); + } + } + } + } + + numSetRecords.fetch_add(1); + return Status(); +} + +Status LogMgr::getSN(const uint64_t seq_num, Record& rec_out) { + Status s; + LogFileInfo* linfo; + + if (parentDb) parentDb->p->updateOpHistory(); + + EP( mani->getLogFileInfoBySeq(seq_num, linfo) ); + LogFileInfoGuard gg(linfo); + if (gg.empty() || gg.ptr->isRemoved()) { + return Status::KEY_NOT_FOUND; + } + + EP( gg->file->getSN(seq_num, rec_out) ); + return Status(); +} + +Status LogMgr::get(const uint64_t chk, + std::list* l_list, + const SizedBuf& key, + Record& rec_out) +{ + Status s; + uint64_t min_log_num, max_log_num; + + // NOTE: Calculate hash value in advance, + // to avoid duplicate overhead. + uint64_t hash_values[2]; + MurmurHash3_x64_128(key.data, key.size, 0, hash_values); + + if (parentDb) parentDb->p->updateOpHistory(); + + if (valid_number(chk)) { + // Snapshot: beyond the last flushed log. + assert(l_list); + auto entry = l_list->rbegin(); + while (entry != l_list->rend()) { + LogFileInfo* l_info = *entry; + s = l_info->file->get(chk, key, hash_values, rec_out, true); + if (s) return s; + entry++; + } + } else { + // Normal: from the last flushed log. + EP( mani->getLastFlushedLog(min_log_num) ); + EP( mani->getMaxLogFileNum(max_log_num) ); + + if (getDbConfig()->logSectionOnly) { + // Log only mode: searching skiplist one-by-one. + for (int64_t ii = max_log_num; ii >= (int64_t)min_log_num; --ii) { + LogFileInfoGuard li(mani->getLogFileInfoP(ii)); + if (li.empty() || li.ptr->isRemoved()) continue; + s = li->file->get(chk, key, hash_values, rec_out, true); + if (s) { + return s; + } + } + + } else { + // Get whole list and then find, + // to reduce skiplist overhead. 
+ std::vector l_files; + EP( mani->getLogFileInfoRange(min_log_num, max_log_num, l_files) ); + size_t num = l_files.size(); + if (!num) return Status::KEY_NOT_FOUND; + + bool found = false; + for (int ii = num-1; ii>=0; --ii) { + LogFileInfo* l_info = l_files[ii]; + if (l_info->isRemoved()) continue; + s = l_info->file->get(chk, key, hash_values, rec_out, true); + if (s) { + found = true; + break; + } + } + + for (LogFileInfo* ll: l_files) ll->done(); + if (found) return Status(); + } + } + return Status::KEY_NOT_FOUND; +} + +Status LogMgr::sync(bool call_fsync) { + std::lock_guard l(syncMutex); + return syncNoWait(call_fsync); +} + +Status LogMgr::syncNoWait(bool call_fsync) { + // Return error in read-only mode. + if (getDbConfig()->readOnly) return Status::WRITE_VIOLATION; + + // Only one sync operation at a time. + OpSemaWrapper ow(&syncSema); + if (!ow.acquire()) { + _log_debug(myLog, "Sync failed. Other thread is working on it."); + return Status::OPERATION_IN_PROGRESS; + } + assert(ow.op_sema->enabled); + return syncInternal(call_fsync); +} + +Status LogMgr::syncInternal(bool call_fsync) { + Status s; + uint64_t ln_from, ln_to; + s = mani->getMaxLogFileNum(ln_to); + if (!s) { + // No log, do nothing. + return Status(); + } + s = mani->getLastSyncedLog(ln_from); + if (!s) { + // Checkpointing (memtable -> logs) never happend. + // Start from the first log file. + EP( mani->getMinLogFileNum(ln_from) ); + } + + // Selective logging based on timer, to avoid verbose messages. + int num_suppressed = 0; + SimpleLogger::Levels log_level = vlSync.checkPrint(num_suppressed) + ? SimpleLogger::INFO + : SimpleLogger::DEBUG; + num_suppressed = (myLog && myLog->getLogLevel() >= SimpleLogger::DEBUG) + ? 0 : num_suppressed; + + if (ln_from + 2 <= ln_to) { + // Big sync (across 3 log files), leave log message. + log_level = SimpleLogger::INFO; + } + _log_(log_level, myLog, "sync log file %zu - %zu (fsync = %s), " + "%d suppressed messages", + ln_from, ln_to, call_fsync ? 
"true" : "false", + num_suppressed); + + uint64_t last_synced_log = ln_from; + for (uint64_t ii=ln_from; ii<=ln_to; ++ii) { + // Write log file first + LogFileInfoGuard li(mani->getLogFileInfoP(ii)); + if (li.empty() || li.ptr->isRemoved()) continue; + + uint64_t before_sync = li->file->getSyncedSeqNum(); + EP( li->file->flushMemTable() ); + uint64_t after_sync = li->file->getSyncedSeqNum(); + _log_( log_level, myLog, "synced log file %zu, min seq %s, " + "flush seq %s, sync seq %s -> %s, max seq %s", + ii, + _seq_str( li->file->getMinSeqNum() ).c_str(), + _seq_str( li->file->getFlushedSeqNum() ).c_str(), + _seq_str( before_sync ).c_str(), + _seq_str( after_sync ).c_str(), + _seq_str( li->file->getMaxSeqNum() ).c_str() ); + if (call_fsync) { + EP( li->file->sync() ); + } + if (valid_number(after_sync)) { + last_synced_log = ii; + } + } + + // Sync up manifest file next + mani->setLastSyncedLog(last_synced_log); + EP( mani->store() ); + if (call_fsync) { + EP( mani->sync() ); + } + _log_(log_level, myLog, "updated log manifest file."); + + return Status(); +} + +Status LogMgr::flush(const FlushOptions& options, + const uint64_t seq_num, + TableMgr* table_mgr) +{ + if (!seq_num) { + // Zero sequence number is not allowed. + return Status::INVALID_SEQNUM; + } + + OpSemaWrapper ow(&flushSema); + if (!ow.acquire()) { + _log_debug(myLog, "Flush skipped. Other thread is working on it."); + return Status::OPERATION_IN_PROGRESS; + } + assert(ow.op_sema->enabled); + + Status s; + Timer tt; + + // Grab all logs and pass them to table manager + uint64_t ln_from, ln_to, ln_to_original; + mani->getLastFlushedLog(ln_from); + if (options.beyondLastSync) { + // Upto the latest log. + mani->getMaxLogFileNum(ln_to); + } else { + // Upto the last synced log. + mani->getLastSyncedLog(ln_to); + } + + if (ln_to == NOT_INITIALIZED) { + // Sync (memtable -> logs) never happend, cannot flush. 
+ return Status::LOG_NOT_SYNCED; + } + if (ln_from == NOT_INITIALIZED) { + // Flush (logs -> tables) never happend. + // Flush from the first log file. + EP( mani->getMinLogFileNum(ln_from) ); + } + + ln_to_original = ln_to; + if ( options.numFilesLimit && + ln_to - ln_from + 1 > options.numFilesLimit ) { + ln_to = ln_from + options.numFilesLimit - 1; + } + + uint64_t seq_num_local = seq_num; + if (seq_num_local == NOT_INITIALIZED) { + // Purge all synced (checkpointed) logs. + LogFileInfoGuard ll(mani->getLogFileInfoP(ln_to, true)); + if (options.beyondLastSync) { + seq_num_local = ll->file->getMaxSeqNum(); + } else { + seq_num_local = ll->file->getSyncedSeqNum(); + } + + } else { + // Not all logs, need to adjust `ln_to`. + seq_num_local = seq_num; + // If in purge only mode, don't need to load mem table. + mani->getLogFileNumBySeq(seq_num_local, ln_to, options.purgeOnly); + } + _log_debug(myLog, "Given seq upto %s, actual seq upto %ld", + _seq_str(seq_num).c_str(), seq_num_local); + + { // Compare given seq with the last flushed seq. + LogFileInfoGuard ll(mani->getLogFileInfoP(ln_from, true)); + uint64_t last_flushed_seq = ll->file->getFlushedSeqNum(); + if (valid_number(last_flushed_seq) && last_flushed_seq >= seq_num) { + // Already flushed. Do nothing. 
+ return Status::ALREADY_FLUSHED; + } + } + + uint64_t num_records_flushed = 0; + if (!options.purgeOnly) { + std::list records; + std::list checkpoints; + bool increasing_order = true; + + for (uint64_t ii = ln_from; ii <= ln_to; ++ii) { + LogFileInfoGuard ll(mani->getLogFileInfoP(ii)); + s = ll->file->getLogsToFlush( seq_num_local, + records, + options.beyondLastSync ); + if (!s) _log_warn(myLog, "s: %d", s); + ll->file->getCheckpoints(seq_num_local, checkpoints); + increasing_order = increasing_order && ll->file->isIncreasingOrder(); + } + num_records_flushed = records.size(); + _log_info( myLog, "Gather records from log files %ld -- %ld, %zu records.", + ln_from, ln_to, num_records_flushed ); + if (increasing_order) { + _log_info( myLog, "INCREASING ORDER, set sequantial loading flag" ); + } + parentDb->p->flags.seqLoading = increasing_order; + + if (records.size()) { + EP( table_mgr->setBatch(records, checkpoints) ); + EP( table_mgr->storeManifest() ); + _log_debug(myLog, "Updated table files."); + } else { + // WARNING: + // Even if `records` is empty, we SHOULD proceed + // as we need to purge log files properly. + } + + // Set flush log & seq number. + for (uint64_t ii = ln_from; ii <= ln_to; ++ii) { + LogFileInfoGuard ll(mani->getLogFileInfoP(ii)); + if (options.beyondLastSync) { + ll->file->setSyncedSeqNum(seq_num_local); + } + EP( ll->file->setFlushedSeqNum(seq_num_local) ); + } + + } else { + // Purge only mode: set flush seq number of the last file. + // WARNING: Should avoid loading memtable because of this call. + LogFileInfoGuard ll(mani->getLogFileInfoP(ln_to, true)); + EP( ll->file->setFlushedSeqNum(seq_num_local) ); + } + + _log_info(myLog, + "Flush done, seq upto %s, actual seq upto %lu, " + "log from %lu to %lu", + _seq_str(seq_num).c_str(), seq_num_local, ln_from, ln_to); + + if (options.beyondLastSync) { + mani->setLastSyncedLog(ln_to); + } + mani->setLastFlushedLog(ln_to); + // Remove log file except for ln_to. 
+ for (uint64_t ii = ln_from; ii < ln_to; ++ii) { + // Avoid loading memtable because of this call. + LogFileInfoGuard ll(mani->getLogFileInfoP(ii, true)); + + // Remove file from manifest. + mani->removeLogFile(ii); + _log_info(myLog, "Removed log file %ld.", ii); + } + + // Store log & table manifest file. + EP( mani->store() ); + EP( mani->sync() ); + _log_debug(myLog, "Updated log manifest file."); + + if (num_records_flushed) { + adjustThrottling(num_records_flushed, tt.getSec(), + options, ln_to_original, ln_to); + } + + return s; +} + +void LogMgr::adjustThrottling(uint64_t num_records_flushed, + double elapsed, + const FlushOptions& options, + uint64_t ln_to_original, + uint64_t ln_to) +{ + uint64_t flush_time_gap_us = lastFlushTimer.getUs(); + lastFlushTimer.reset(); + lastFlushIntervalMs = flush_time_gap_us / 1000; + + int64_t num_set_records = numSetRecords.load(); + + double incoming_rate = + flush_time_gap_us + ? (double)num_set_records * 1000000 / flush_time_gap_us + : 0; + double log_flush_rate = + elapsed ? 
num_records_flushed / elapsed : 0; + + if (parentDb) { + parentDb->p->tStats.lastLogFlushRate = log_flush_rate; + } + double slowest_speed = getSlowestMergeRate(false); + + size_t num_log_files = getNumLogFiles(); + _log_info( myLog, "numFilesLimit %zu, num log files %zu, " + "num records flushed %zu, num set records %zd, " + "incoming rate %.1f iops, flush rate %.1f iops, " + "slowest rate %.1f iops, last flush interval %zu ms", + options.numFilesLimit, + num_log_files, + num_records_flushed, + num_set_records, + incoming_rate, + log_flush_rate, + slowest_speed, + lastFlushIntervalMs.load() ); + bool enable_throttling = false; + bool too_many_logs = false; + + if (num_log_files > 128) { + enable_throttling = true; + too_many_logs = true; + } + + if ( slowest_speed > 0 && + num_records_flushed > getDbConfig()->throttlingThreshold && + incoming_rate > slowest_speed ) { + enable_throttling = true; + } + + if (enable_throttling) { + throttlingRate.store( slowest_speed ); + if (too_many_logs) { + adjustThrottlingExtreme(); + } + _log_info(myLog, + "enable write throttling, # records flushed %zu, %s" + "new throttling rate %.1f ops/sec (%.1f us)", + num_records_flushed, + (too_many_logs) ? "too many logs, " : "", + throttlingRate.load(), + 1000000.0 / throttlingRate.load()); + + } else { + // If # records waiting for being flushed is less than a threshold, + // cancel the throttling. + throttlingRate = 0; + _log_info(myLog, + "cancel write throttling, # records flushed %zu, %.3f sec", + num_records_flushed, elapsed); + } + throttlingRateTimer.reset(); + + if (numSetRecords >= (int64_t)num_records_flushed) { + numSetRecords.fetch_sub(num_records_flushed); + } else { + numSetRecords.store(0); + } +} + +double LogMgr::getSlowestMergeRate(bool include_table_rate) { + if (!parentDb) return 0; + + // Pick the smallest non-zero rate. 
+ std::set rates; + const DB::DBInternal::ThrottlingStats& t_stats = parentDb->p->tStats; + if ( t_stats.lastLogFlushRate.load() ) { + rates.insert( t_stats.lastLogFlushRate.load() ); + } + + if (include_table_rate) { + if ( t_stats.lastTableFlushRate.load() && + !t_stats.lastTableFlushRateExpiry.timeout() ) { + rates.insert( t_stats.lastTableFlushRate.load() ); + } + if ( t_stats.lastSplitRate.load() && + !t_stats.lastSplitRateExpiry.timeout() ) { + rates.insert( t_stats.lastSplitRate.load() ); + } + } + + if (!rates.size()) return 0; + return *rates.begin(); +} + +void LogMgr::adjustThrottlingExtreme() { + if (getDbConfig()->logSectionOnly) return; + + size_t num_logs = getNumLogFiles(); + if (num_logs <= 128) return; + + size_t factor = num_logs - 128; + double target = 10000.0 / factor; + + if (throttlingRate == 0) { + throttlingRate.store(target); + } else { + throttlingRate = std::min( target, throttlingRate.load() ); + } + throttlingRateTimer.reset(); +} + +Status LogMgr::doLogReclaim() { + OpSemaWrapper ow(&reclaimSema); + if (!ow.acquire()) { + _log_debug(myLog, "Reclaim skipped. Other thread is working on it."); + return Status::OPERATION_IN_PROGRESS; + } + assert(ow.op_sema->enabled); + + mani->reclaimExpiredLogFiles(); + return Status(); +} + +Status LogMgr::checkpoint(uint64_t& seq_num_out, bool call_fsync) { + Status s; + uint64_t log_file_num = 0; + + for(;;) { + // Get latest log file. + EP(mani->getMaxLogFileNum(log_file_num)); + LogFileInfoGuard g_li(mani->getLogFileInfoP(log_file_num)); + if (g_li.empty() || g_li.ptr->isRemoved()) continue; + + // If this file is already immutable: force append. + EP(g_li->file->checkpoint(seq_num_out)); + break; + } + // Durable sync all. + s = sync(call_fsync); + // Tolerate race condition, report error for all the others. 
+ if (!s && s != Status::OPERATION_IN_PROGRESS) return s; + + return Status(); +} + +Status LogMgr::getAvailCheckpoints(std::list& chk_out) { + Status s; + uint64_t ln_flushed; + uint64_t ln_max; + + for (;;) { + EP(mani->getLastFlushedLog(ln_flushed)); + EP(mani->getMaxLogFileNum(ln_max)); + + // Check every file. + for (uint64_t ii=ln_flushed; ii<=ln_max; ++ii) { + LogFileInfoGuard g(mani->getLogFileInfoP(ii)); + if (g.empty() || g.ptr->isRemoved()) continue; // skip this file. + + g.file()->getCheckpoints(NOT_INITIALIZED, chk_out); + } + break; + } + return Status(); +} + + +Status LogMgr::getAvailSeqRange(uint64_t& min_seq, + uint64_t& max_seq) +{ + Status s; + uint64_t ln_flush; + + for (;;) { + EP( mani->getLastFlushedLog(ln_flush) ); + LogFileInfoGuard li_flush( mani->getLogFileInfoP(ln_flush, true) ); + if (li_flush.empty() || li_flush.ptr->isRemoved()) continue; + + min_seq = li_flush->file->getFlushedSeqNum(); + if (min_seq == NOT_INITIALIZED) { + // Purge never happened. + min_seq = 0; + } else { + // Available seq number starts from purge seq + 1. + min_seq++; + } + break; + } + + // Note: if max file doesn't exist, it means there is no log. + // Return failure in that case. 
+ uint64_t ln_max = NOT_INITIALIZED; + for (;;) { + EP( mani->getMaxLogFileNum(ln_max) ); + LogFileInfoGuard li_max(mani->getLogFileInfoP(ln_max, true)); + if (li_max.empty() || li_max.ptr->isRemoved()) continue; + max_seq = li_max->file->getMaxSeqNum(); + break; + } + + return Status(); +} + +Status LogMgr::getMaxSeqNum(uint64_t& seq_num_out) { + Status s; + uint64_t ln_max = 0; + uint64_t max_seq = NOT_INITIALIZED; + const size_t MAX_TRY = 16; + + for (size_t num_tries = 0; num_tries < MAX_TRY; num_tries++) { + EP( mani->getMaxLogFileNum(ln_max) ); + + bool succ = false; + for (int64_t cur_idx = ln_max; cur_idx >= 0; --cur_idx) { + LogFileInfoGuard li(mani->getLogFileInfoP(cur_idx, true)); + if (li.empty() || li.ptr->isRemoved()) { + break; + } + max_seq = li->file->getMaxSeqNum(); + if (max_seq != NOT_INITIALIZED) { + succ = true; + break; + } + } + if (succ) break; + } + + if (max_seq == NOT_INITIALIZED) { + return Status::LOG_NOT_EXIST; + } + seq_num_out = max_seq; + return Status(); +} + +Status LogMgr::getMinSeqNum(uint64_t& seq_num_out) { + Status s; + uint64_t ln_min = 0; + uint64_t min_seq = NOT_INITIALIZED; + + LogFileInfo* lf_info = nullptr; + + for (;;) { + EP(mani->getLastFlushedLog(ln_min)); + // WARNING: Should avoid file loading due to this call. + lf_info = mani->getLogFileInfoP(ln_min, true); + if (!lf_info || lf_info->isRemoved()) continue; + min_seq = lf_info->file->getFlushedSeqNum(); + break; + } + + LogFileInfoGuard li(lf_info); + + if (valid_number(min_seq)) { + if (min_seq == li->file->getMaxSeqNum()) { + LogFileInfo* next_file; + // WARNING: Should avoid file loading due to this call. + s = mani->getLogFileInfo(ln_min+1, next_file, true); + if (!s) { + // Next file doesn't exist, + // means that there is no record in log section. + return Status::LOG_NOT_EXIST; + } + next_file->done(); + } + // Min seq: last flush + 1 + min_seq++; + } else { + // Nothing has been flushed yet. Get min seq. 
+ min_seq = li->file->getMinSeqNum(); + if (!valid_number(min_seq)) { + return Status::LOG_NOT_EXIST; + } + } + seq_num_out = min_seq; + return Status(); +} + +Status LogMgr::getLastFlushedSeqNum(uint64_t& seq_num_out) { + Status s; + uint64_t ln_flush = 0; + uint64_t flush_seq = NOT_INITIALIZED; + const size_t MAX_TRY = 16; + + for (size_t num_tries=0; num_tries < MAX_TRY; ++num_tries) { + EP(mani->getLastFlushedLog(ln_flush)); + LogFileInfoGuard li(mani->getLogFileInfoP(ln_flush, true)); + if (li.empty() || li.ptr->isRemoved()) continue; + flush_seq = li->file->getFlushedSeqNum(); + break; + } + + if (!valid_number(flush_seq)) { + // Nothing has been flushed yet. + return Status::INVALID_SEQNUM; + } + seq_num_out = flush_seq; + return Status(); +} + +Status LogMgr::getLastSyncedSeqNum(uint64_t& seq_num_out) { + Status s; + uint64_t ln_sync = 0; + uint64_t sync_seq = NOT_INITIALIZED; + const size_t MAX_TRY = 16; + + for (size_t num_tries=0; num_tries < MAX_TRY; ++num_tries) { + EP(mani->getLastSyncedLog(ln_sync)); + LogFileInfoGuard li(mani->getLogFileInfoP(ln_sync, true)); + if (li.empty() || li.ptr->isRemoved()) continue; + sync_seq = li->file->getSyncedSeqNum(); + if (!valid_number(sync_seq)) { + // This should be a bug. + _log_err( myLog, "log file %zu returned invalid seq number, " + "evicted %d removed %d memtable purge %d", + li.ptr->logFileNum, + li.ptr->evicted.load(), + li.ptr->removed.load(), + li->file->isMemTablePurged() ); + assert(0); + } + break; + } + + if (!valid_number(sync_seq)) { + // Nothing has been flushed yet. 
+ return Status::INVALID_SEQNUM; + } + seq_num_out = sync_seq; + return Status(); +} + + +bool LogMgr::checkTimeToFlush(const GlobalConfig& config) { + Status s; + uint64_t l_last_flush = 0; + uint64_t l_max = 0; + uint64_t seq_last_flush = NOT_INITIALIZED; + uint64_t seq_max = NOT_INITIALIZED; + + if (getDbConfig()->readOnly) return false; + if (syncSema.grabbed) return false; + if (flushSema.grabbed) return false; + if (getDbConfig()->logSectionOnly) return false; + + const size_t MAX_TRY = 10; + size_t num_try = 0; + for (num_try = 0; num_try < MAX_TRY; ++num_try) { + s = mani->getMaxLogFileNum(l_max); + if (!s) return false; + + LogFileInfoGuard g_max(mani->getLogFileInfoP(l_max, true)); + if (g_max.empty() || g_max.ptr->isRemoved()) continue; + + seq_max = g_max->file->getMaxSeqNum(); + break; + } + if (num_try >= MAX_TRY) return false; + + for (num_try = 0; num_try < MAX_TRY; ++num_try) { + s = mani->getLastFlushedLog(l_last_flush); + if (!s) l_last_flush = 0; + + LogFileInfoGuard g_flush(mani->getLogFileInfoP(l_last_flush, true)); + if (g_flush.empty() || g_flush.ptr->isRemoved()) continue; + + seq_last_flush = g_flush->file->getFlushedSeqNum(); + break; + } + if (num_try >= MAX_TRY) return false; + + if (seq_last_flush == NOT_INITIALIZED) seq_last_flush = 0; + if (seq_max == NOT_INITIALIZED) return false; + + // If seq number gap exceeds the limit. + if (seq_max > seq_last_flush + config.flusherMinRecordsToTrigger) { + return true; + } + // If the number of log files exceeds the limit. + if (l_max > l_last_flush + config.flusherMinLogFilesToTrigger) { + return true; + } + + return false; +} + +Status LogMgr::close() { + if (!initialized) return Status(); + + // If sync() or flush() is running, + // wait until they finish their jobs. 
+ OpSemaWrapper op_sync(&syncSema); + _log_info(myLog, "Wait for on-going sync operation."); + + uint64_t ticks = 0; + while (!op_sync.acquire()) { + ticks++; + Timer::sleepMs(1); + } + syncSema.enabled = false; + _log_info(myLog, "Disabled syncing for %p, %zu ticks", this, ticks); + + if (!getDbConfig()->readOnly) { + // Last sync before close (not in read-only mode). + syncInternal(false); + _log_info(myLog, "Last sync done"); + } else { + _log_info(myLog, "read-only mode: skip the last sync"); + } + + OpSemaWrapper op_flush(&flushSema); + _log_info(myLog, "Wait for on-going flush operation."); + ticks = 0; + while (!op_flush.acquire()) { + ticks++; + Timer::sleepMs(1); + } + + flushSema.enabled = false; + _log_info(myLog, "Disabled flushing for %p, %zu ticks", this, ticks); + + OpSemaWrapper op_reclaim(&reclaimSema); + _log_info(myLog, "Wait for on-going log reclaim operation."); + ticks = 0; + while (!op_reclaim.acquire()) { + ticks++; + Timer::sleepMs(1); + } + + reclaimSema.enabled = false; + _log_info(myLog, "Disabled reclaiming for %p, %zu ticks", this, ticks); + + initialized = false; + return Status(); +} + +Status LogMgr::syncSeqnum(TableMgr* t_mgr) { + // WARNING: + // This function will be called on opening DB only, + // assuming that the DB is NOT activated yet. + + uint64_t last_seqnum = NOT_INITIALIZED; + t_mgr->getLastSeqnum(last_seqnum); + + // If tables do not exist, do nothing. + if (!valid_number(last_seqnum)) return Status(); + + uint64_t min_log_file = 0; + uint64_t max_log_file = 0; + Status s; + s = mani->getMinLogFileNum(min_log_file); + // Log section is empty, do nothing. 
+ if (!s) return Status(); + s = mani->getMaxLogFileNum(max_log_file); + + for (size_t ii=min_log_file; ii<=max_log_file; ++ii) { + LogFileInfoGuard ll( mani->getLogFileInfoP(ii) ); + uint64_t min_seq = ll.file()->getMinSeqNum(); + uint64_t flushed_seq = ll.file()->getFlushedSeqNum(); + uint64_t seq_counter = ll.file()->getSeqCounter(); + _log_info(myLog, "log file %zu, min seq %s flushed seq %s seq counter %s " + "table seq %s", + ii, + _seq_str(min_seq).c_str(), + _seq_str(flushed_seq).c_str(), + _seq_str(seq_counter).c_str(), + _seq_str(last_seqnum).c_str()); + // WARNING: + // We should not force set flushed seq number. + // If crash happens in the middle of flushing, + // we should re-flush them, instead of force-setting + // the last flushed number (it causes data loss). + //ll.file()->forceSeqnum(last_seqnum); + } + + return s; +} + +size_t LogMgr::getNumLogFiles() { + if (!initialized || !mani) return 0; + return mani->getNumLogFiles(); +} + +} // namespace jungle diff --git a/src/log_mgr.h b/src/log_mgr.h new file mode 100644 index 0000000..c447178 --- /dev/null +++ b/src/log_mgr.h @@ -0,0 +1,340 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "avltree.h" +#include "fileops_base.h" +#include "internal_helper.h" +#include "log_file.h" +#include "log_manifest.h" +#include "table_mgr.h" + +#include + +#include +#include +#include +#include + +class SimpleLogger; + +namespace jungle { + +class LogMgrOptions { +public: + LogMgrOptions() + : fOps(nullptr) + , fDirectOps(nullptr) + , prefixNum(0) + , dbConfig(nullptr) + {} + + std::string path; + FileOps* fOps; + FileOps* fDirectOps; + // KVS ID. + uint64_t prefixNum; + std::string kvsName; + // Pointer to the parent DB handle's config. + const DBConfig* dbConfig; +}; + +// Semaphore that allows only one operation at a time. +struct OpSema { + OpSema() : enabled(true), grabbed(false) {} + std::atomic enabled; + std::atomic grabbed; +}; + +struct OpSemaWrapper { + OpSemaWrapper(OpSema* _op_sema) : op_sema(_op_sema), acquired(false) {} + ~OpSemaWrapper() { + if (acquired) { + op_sema->grabbed = false; + } + op_sema = nullptr; + acquired = false; + } + + bool acquire() { + bool expected = false; + bool val = true; + if ( op_sema->enabled && + op_sema->grabbed.compare_exchange_weak(expected, val) ) { + acquired = true; + } + return acquired; + } + + OpSema* op_sema; + bool acquired; +}; + +namespace checker { class Checker; } + +class LogMgr { + friend class checker::Checker; + +public: + LogMgr(DB* parent_db, const LogMgrOptions& _options = LogMgrOptions()); + + ~LogMgr(); + + Status init(const LogMgrOptions& _options); + + void logMgrSettings(); + + Status rollback(uint64_t seq_upto); + + Status removeStaleFiles(); + + bool isTtlMode() const; + + Status openSnapshot(DB* snap_handle, + const uint64_t checkpoint, + std::list*& log_file_list_out); + Status closeSnapshot(DB* snap_handle); + + Status setByBulkLoader(std::list& batch, + TableMgr* table_mgr, + bool last_batch = false); + + Status setSN(const Record& rec); + + // Returns pointer only. 
+ Status getSN(const uint64_t seq_num, Record& rec_out); + + // Returns pointer only. + Status get(const uint64_t chk, + std::list* l_list, + const SizedBuf& key, + Record& rec_out); + + Status sync(bool call_fsync = true); + + Status syncNoWait(bool call_fsync = true); + +protected: + Status syncInternal(bool call_fsync); + + Status addNewLogFile(LogFileInfoGuard& cur_log_file_info, + LogFileInfoGuard& new_log_file_info); + + void adjustThrottling(uint64_t num_records_flushed, + double elapsed, + const FlushOptions& options, + uint64_t ln_to_original, + uint64_t ln_to); + + double getSlowestMergeRate(bool include_table_rate = true); + + void adjustThrottlingExtreme(); + +public: + Status flush(const FlushOptions& options, + const uint64_t seq_num, + TableMgr* table_mgr); + + Status doLogReclaim(); + + Status checkpoint(uint64_t& seq_num_out, bool call_fsync = true); + Status getAvailCheckpoints(std::list& chk_out); + + // Return (last flushed seq + 1) to max seq + Status getAvailSeqRange(uint64_t& min_seq, + uint64_t& max_seq); + Status getMaxSeqNum(uint64_t& seq_num_out); + Status getMinSeqNum(uint64_t& seq_num_out); + Status getLastFlushedSeqNum(uint64_t& seq_num_out); + Status getLastSyncedSeqNum(uint64_t& seq_num_out); + + bool checkTimeToFlush(const GlobalConfig& config); + Status close(); + + Status syncSeqnum(TableMgr* t_mgr); + + inline const DBConfig* getDbConfig() const { return opt.dbConfig; } + + void setLogger(SimpleLogger* logger) { + myLog = logger; + if (mani) mani->setLogger(myLog); + } + + size_t getNumLogFiles(); + + struct Iterator { + public: + Iterator(); + ~Iterator(); + + enum SeekOption { + GREATER = 0, + SMALLER = 1, + }; + + Status init(DB* snap_handle, + LogMgr* log_mgr, + const SizedBuf& start_key, + const SizedBuf& end_key); + Status initSN(DB* snap_handle, + LogMgr* log_mgr, + uint64_t min_seq, + uint64_t max_seq); + Status get(Record& rec_out); + Status prev(); + Status next(); + Status seek(const SizedBuf& key, SeekOption opt = 
GREATER); + Status seekSN(const uint64_t seqnum, SeekOption opt = GREATER); + Status gotoBegin(); + Status gotoEnd(); + Status close(); + private: + enum Type { + BY_KEY = 0, + BY_SEQ = 1 + }; + struct ItrItem { + ItrItem() : flags(0x0), lInfo(nullptr), lItr(nullptr) {} + enum Flag { + none = 0x0, + no_more_prev = 0x1, + no_more_next = 0x2, + }; + inline static int cmpSeq(avl_node *a, avl_node *b, void *aux) { + ItrItem* aa = _get_entry(a, ItrItem, an); + ItrItem* bb = _get_entry(b, ItrItem, an); + if (aa->lastRec.seqNum < bb->lastRec.seqNum) return -1; + else if (aa->lastRec.seqNum > bb->lastRec.seqNum) return 1; + return 0; + } + inline static int cmpKey(avl_node *a, avl_node *b, void *aux) { + ItrItem* aa = _get_entry(a, ItrItem, an); + ItrItem* bb = _get_entry(b, ItrItem, an); + + CMP_NULL_CHK(aa->lastRec.kv.key.data, bb->lastRec.kv.key.data); + + int cmp = 0; + if (aux) { + // Custom cmp mode. + LogMgr* lm = reinterpret_cast(aux); + CustomCmpFunc func = lm->getDbConfig()->cmpFunc; + void* param = lm->getDbConfig()->cmpFuncParam; + cmp = func(aa->lastRec.kv.key.data, aa->lastRec.kv.key.size, + bb->lastRec.kv.key.data, bb->lastRec.kv.key.size, + param); + } else { + cmp = SizedBuf::cmp(aa->lastRec.kv.key, bb->lastRec.kv.key); + } + + // Note: key: ascending, seq: descending order. 
+ if (cmp == 0) return cmpSeq(b, a, aux); + return cmp; + } + avl_node an; + uint8_t flags; + LogFileInfo* lInfo; + LogFile::Iterator* lItr; + Record lastRec; + }; + + Status initInternal(DB* snap_handle, + LogMgr* log_mgr, + uint64_t min_seq, + uint64_t max_seq, + const SizedBuf& start_key, + const SizedBuf& end_key, + LogMgr::Iterator::Type _type); + Status seekInternal(const SizedBuf& key, + const uint64_t seqnum, + SeekOption opt, + bool goto_end = false); + Status moveToLastValid(); + + void addLogFileItr(LogFileInfo* l_info); + inline int cmpSizedBuf(const SizedBuf& l, const SizedBuf& r); + inline bool checkValidBySeq(ItrItem* item, + const uint64_t cur_seq, + const bool is_prev = false); + inline bool checkValidByKey(ItrItem* item, + const SizedBuf& cur_key, + const bool is_prev = false); + + Type type; + LogMgr* lMgr; + std::vector itrs; + std::list* snapLogList; + uint64_t minSeqSnap; + uint64_t maxSeqSnap; + SizedBuf startKey; + SizedBuf endKey; + avl_tree curWindow; + avl_node* windowCursor; + }; + +protected: +// === TYPES + using LogFileList = std::list; + using SnapMap = std::unordered_map; + +// === VARIABLES + // Backward pointer to parent DB instance. + DB* parentDb; + + std::atomic initialized; + LogMgrOptions opt; + LogManifest* mani; + + std::recursive_mutex writeMutex; + + OpSema syncSema; + std::mutex syncMutex; + + OpSema flushSema; + + OpSema reclaimSema; + + std::mutex addNewLogFileMutex; + + std::mutex sMapLock; + SnapMap sMap; + + // IOPS. + // If non-zero, throttling is enabled based on this number. + std::atomic throttlingRate; + + // Timer that remembers the last time `throttlingRate` + // was updated. + Timer throttlingRateTimer; + + // Keep the last flushed time. + Timer lastFlushTimer; + + // Interval of last two flushes in ms. + std::atomic lastFlushIntervalMs; + + // Number of set calls since the last flush. + std::atomic numSetRecords; + + // Logger. + SimpleLogger* myLog; + + // Verbose logging control for sync. 
+ VerboseLog vlSync; +}; + +} // namespace jungle + diff --git a/src/log_reclaimer.cc b/src/log_reclaimer.cc new file mode 100644 index 0000000..96f4eb9 --- /dev/null +++ b/src/log_reclaimer.cc @@ -0,0 +1,78 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "db_mgr.h" +#include "db_internal.h" +#include "internal_helper.h" +#include "log_mgr.h" +#include "log_reclaimer.h" +#include "skiplist.h" + +#include + +#include _MACRO_TO_STR(LOGGER_H) + +#include + +namespace jungle { + +LogReclaimer::LogReclaimer(const std::string& _w_name, + const GlobalConfig& _config) +{ + workerName = _w_name; + gConfig = _config; + LogReclaimerOptions options; + options.sleepDuration_ms = gConfig.logFileReclaimerSleep_sec * 1000; + options.worker = this; + curOptions = options; + handle = std::thread(WorkerBase::loop, &curOptions); +} + +LogReclaimer::~LogReclaimer() {} + +void LogReclaimer::work(WorkerOptions* opt_base) { + Status s; + + DBMgr* dbm = DBMgr::getWithoutInit(); + if (!dbm) return; + + std::list target_dbs; + + { // Check DB map. 
+ std::lock_guard l(dbm->dbMapLock); + + skiplist_node* cursor = skiplist_begin(&dbm->dbMap); + while (cursor) { + DBWrap* dbwrap = _get_entry(cursor, DBWrap, snode); + if (dbwrap->db->p->dbConfig.logSectionOnly) { + target_dbs.push_back(dbwrap->db); + } + cursor = skiplist_next(&dbm->dbMap, cursor); + skiplist_release_node(&dbwrap->snode); + } + if (cursor) skiplist_release_node(cursor); + } + + if (target_dbs.empty()) return; + + for (auto& entry: target_dbs) { + DB* db = entry; + db->p->logMgr->doLogReclaim(); + } +} + +} // namespace jungle + diff --git a/src/log_reclaimer.h b/src/log_reclaimer.h new file mode 100644 index 0000000..f4c887a --- /dev/null +++ b/src/log_reclaimer.h @@ -0,0 +1,41 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "worker_mgr.h" + +#include + +namespace jungle { + +class LogReclaimer : public WorkerBase { +public: + struct LogReclaimerOptions : public WorkerOptions { + }; + + LogReclaimer(const std::string& _w_name, + const GlobalConfig& _config); + + ~LogReclaimer(); + + void work(WorkerOptions* opt_base); + + GlobalConfig gConfig; +}; + + +} // namespace jungle diff --git a/src/logger.cc b/src/logger.cc new file mode 100644 index 0000000..1b06bce --- /dev/null +++ b/src/logger.cc @@ -0,0 +1,1239 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2017 Jung-Sang Ahn +See URL: https://github.com/greensky00/simple_logger + (v0.3.25) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "logger.h" + +#if defined(__linux__) || defined(__APPLE__) + #include "backtrace.h" +#endif + +#include +#include +#include + +#include + +#if defined(__linux__) || defined(__APPLE__) + #include + #ifdef __linux__ + #include + #endif + #include + #include + #include + +#elif defined(WIN32) || defined(_WIN32) + #include + #undef min + #undef max +#endif + +#include +#include + +#ifndef _CLM_DEFINED +#define _CLM_DEFINED (1) + +#ifdef LOGGER_NO_COLOR + #define _CLM_D_GRAY "" + #define _CLM_GREEN "" + #define _CLM_B_GREEN "" + #define _CLM_RED "" + #define _CLM_B_RED "" + #define _CLM_BROWN "" + #define _CLM_B_BROWN "" + #define _CLM_BLUE "" + #define _CLM_B_BLUE "" + #define _CLM_MAGENTA "" + #define _CLM_B_MAGENTA "" + #define _CLM_CYAN "" + #define _CLM_END "" + + #define _CLM_WHITE_FG_RED_BG "" +#else + #define _CLM_D_GRAY "\033[1;30m" + #define _CLM_GREEN "\033[32m" + #define _CLM_B_GREEN "\033[1;32m" + #define _CLM_RED "\033[31m" + #define _CLM_B_RED "\033[1;31m" + #define _CLM_BROWN "\033[33m" + #define _CLM_B_BROWN "\033[1;33m" + #define _CLM_BLUE "\033[34m" + #define _CLM_B_BLUE "\033[1;34m" + #define _CLM_MAGENTA "\033[35m" + #define _CLM_B_MAGENTA "\033[1;35m" + #define _CLM_CYAN "\033[36m" + #define _CLM_B_GREY "\033[1;37m" + #define _CLM_END "\033[0m" + + #define _CLM_WHITE_FG_RED_BG "\033[37;41m" +#endif + +#define _CL_D_GRAY(str) _CLM_D_GRAY str _CLM_END +#define _CL_GREEN(str) _CLM_GREEN str _CLM_END +#define _CL_RED(str) _CLM_RED str _CLM_END +#define _CL_B_RED(str) _CLM_B_RED str _CLM_END +#define _CL_MAGENTA(str) _CLM_MAGENTA str _CLM_END +#define _CL_BROWN(str) _CLM_BROWN str _CLM_END +#define _CL_B_BROWN(str) _CLM_B_BROWN str _CLM_END +#define _CL_B_BLUE(str) _CLM_B_BLUE str _CLM_END +#define _CL_B_MAGENTA(str) _CLM_B_MAGENTA str _CLM_END +#define _CL_CYAN(str) _CLM_CYAN str _CLM_END +#define _CL_B_GRAY(str) _CLM_B_GREY str _CLM_END + +#define 
_CL_WHITE_FG_RED_BG(str) _CLM_WHITE_FG_RED_BG str _CLM_END + +#endif + +std::atomic SimpleLoggerMgr::instance(nullptr); +std::mutex SimpleLoggerMgr::instanceLock; +std::mutex SimpleLoggerMgr::displayLock; + +struct SimpleLoggerMgr::CompElem { + CompElem(uint64_t num, SimpleLogger* logger) + : fileNum(num), targetLogger(logger) + {} + uint64_t fileNum; + SimpleLogger* targetLogger; +}; + +SimpleLoggerMgr::TimeInfo::TimeInfo(std::tm* src) + : year(src->tm_year + 1900) + , month(src->tm_mon + 1) + , day(src->tm_mday) + , hour(src->tm_hour) + , min(src->tm_min) + , sec(src->tm_sec) + , msec(0) + , usec(0) + {} + +SimpleLoggerMgr::TimeInfo::TimeInfo(std::chrono::system_clock::time_point now) { + std::time_t raw_time = std::chrono::system_clock::to_time_t(now); + std::tm new_time; + +#if defined(__linux__) || defined(__APPLE__) + std::tm* lt_tm = localtime_r(&raw_time, &new_time); + +#elif defined(WIN32) || defined(_WIN32) + localtime_s(&new_time, &raw_time); + std::tm* lt_tm = &new_time; +#endif + + year = lt_tm->tm_year + 1900; + month = lt_tm->tm_mon + 1; + day = lt_tm->tm_mday; + hour = lt_tm->tm_hour; + min = lt_tm->tm_min; + sec = lt_tm->tm_sec; + + size_t us_epoch = std::chrono::duration_cast< std::chrono::microseconds > + ( now.time_since_epoch() ).count(); + msec = (us_epoch / 1000) % 1000; + usec = us_epoch % 1000; +} + + +SimpleLoggerMgr* SimpleLoggerMgr::init() { + SimpleLoggerMgr* mgr = instance.load(SimpleLogger::MOR); + if (!mgr) { + std::lock_guard l(instanceLock); + mgr = instance.load(SimpleLogger::MOR); + if (!mgr) { + mgr = new SimpleLoggerMgr(); + instance.store(mgr, SimpleLogger::MOR); + } + } + return mgr; +} + +SimpleLoggerMgr* SimpleLoggerMgr::get() { + SimpleLoggerMgr* mgr = instance.load(SimpleLogger::MOR); + if (!mgr) return init(); + return mgr; +} + +SimpleLoggerMgr* SimpleLoggerMgr::getWithoutInit() { + SimpleLoggerMgr* mgr = instance.load(SimpleLogger::MOR); + return mgr; +} + +void SimpleLoggerMgr::destroy() { + std::lock_guard 
l(instanceLock); + SimpleLoggerMgr* mgr = instance.load(SimpleLogger::MOR); + if (mgr) { + mgr->flushAllLoggers(); + delete mgr; + instance.store(nullptr, SimpleLogger::MOR); + } +} + +int SimpleLoggerMgr::getTzGap() { + std::chrono::system_clock::time_point now = + std::chrono::system_clock::now(); + std::time_t raw_time = std::chrono::system_clock::to_time_t(now); + std::tm new_time; + +#if defined(__linux__) || defined(__APPLE__) + std::tm* lt_tm = localtime_r(&raw_time, &new_time); + std::tm* gmt_tm = std::gmtime(&raw_time); + +#elif defined(WIN32) || defined(_WIN32) + localtime_s(&new_time, &raw_time); + std::tm* lt_tm = &new_time; + std::tm new_gmt_time; + gmtime_s(&new_gmt_time, &raw_time); + std::tm* gmt_tm = &new_gmt_time; +#endif + + TimeInfo lt(lt_tm); + TimeInfo gmt(gmt_tm); + + return ( ( lt.day * 60 * 24 + lt.hour * 60 + lt.min ) - + ( gmt.day * 60 * 24 + gmt.hour * 60 + gmt.min ) ); +} + +// LCOV_EXCL_START + +void SimpleLoggerMgr::flushCriticalInfo() { + std::string msg = " === Critical info (given by user): "; + msg += std::to_string(globalCriticalInfo.size()) + " bytes"; + msg += " ==="; + if (!globalCriticalInfo.empty()) { + msg += "\n" + globalCriticalInfo; + } + flushAllLoggers(2, msg); + if (crashDumpFile.is_open()) { + crashDumpFile << msg << std::endl << std::endl; + } +} + +void SimpleLoggerMgr::_flushStackTraceBuffer(size_t buffer_len, + uint32_t tid_hash, + uint64_t kernel_tid, + bool crash_origin) +{ + std::string msg; + char temp_buf[256]; + sprintf(temp_buf, "\nThread %04x", tid_hash); + msg += temp_buf; + if (kernel_tid) { + msg += " (" + std::to_string(kernel_tid) + ")"; + } + if (crash_origin) { + msg += " (crashed here)"; + } + msg += "\n\n"; + msg += std::string(stackTraceBuffer, buffer_len); + + size_t msg_len = msg.size(); + size_t per_log_size = SimpleLogger::MSG_SIZE - 1024; + for (size_t ii=0; ii l(activeThreadsLock); + std::string msg = "captured "; + msg += std::to_string(activeThreads.size()) + " active threads"; + 
flushAllLoggers(2, msg); + if (crashDumpFile.is_open()) crashDumpFile << msg << "\n\n"; + + for (uint64_t _tid: activeThreads) { + pthread_t tid = (pthread_t)_tid; + if (_tid == crashOriginThread) continue; + + struct sigaction _action; + sigfillset(&_action.sa_mask); + _action.sa_flags = SA_SIGINFO; + _action.sa_sigaction = SimpleLoggerMgr::handleStackTrace; + sigaction(SIGUSR2, &_action, NULL); + + pthread_kill(tid, SIGUSR2); + + sigset_t _mask; + sigfillset(&_mask); + sigdelset(&_mask, SIGUSR2); + sigsuspend(&_mask); + } + + msg = "got all stack traces, now flushing them"; + flushAllLoggers(2, msg); + + got_other_stacks = true; + } +#endif + + if (!got_other_stacks) { + std::string msg = "will not explore other threads (disabled by user)"; + flushAllLoggers(2, msg); + if (crashDumpFile.is_open()) { + crashDumpFile << msg << "\n\n"; + } + } +} + +void SimpleLoggerMgr::flushRawStack(RawStackInfo& stack_info) { + if (!crashDumpFile.is_open()) return; + + crashDumpFile << "Thread " << std::hex << std::setw(4) << std::setfill('0') + << stack_info.tidHash << std::dec + << " " << stack_info.kernelTid << std::endl; + if (stack_info.crashOrigin) { + crashDumpFile << "(crashed here)" << std::endl; + } + for (void* stack_ptr: stack_info.stackPtrs) { + crashDumpFile << std::hex << stack_ptr << std::dec << std::endl; + } + crashDumpFile << std::endl; +} + +void SimpleLoggerMgr::addRawStackInfo(bool crash_origin) { +#if defined(__linux__) || defined(__APPLE__) + void* stack_ptr[256]; + size_t len = _stack_backtrace(stack_ptr, 256); + + crashDumpThreadStacks.push_back(RawStackInfo()); + RawStackInfo& stack_info = *(crashDumpThreadStacks.rbegin()); + std::thread::id tid = std::this_thread::get_id(); + stack_info.tidHash = std::hash{}(tid) % 0x10000; +#ifdef __linux__ + stack_info.kernelTid = (uint64_t)syscall(SYS_gettid); +#endif + stack_info.crashOrigin = crash_origin; + for (size_t ii=0; ii= 0) ? 
'+' : '-', + (int)(tz_gap_abs / 60), tz_gap_abs % 60); + std::string path = crashDumpPath + "/" + filename; + crashDumpFile.open(path); + + char time_fmt[64]; + sprintf(time_fmt, "%04d-%02d-%02dT%02d:%02d:%02d.%03d%03d%c%02d:%02d", + lt.year, lt.month, lt.day, + lt.hour, lt.min, lt.sec, lt.msec, lt.usec, + (tz_gap >= 0) ? '+' : '-', + (int)(tz_gap_abs / 60), tz_gap_abs % 60); + crashDumpFile << "When: " << time_fmt << std::endl << std::endl; + } + + flushCriticalInfo(); + addRawStackInfo(true); + // Collect other threads' stack info. + logStackBackTraceOtherThreads(); + + // Now print out. + // For the case where `addr2line` is hanging, flush raw pointer first. + for (RawStackInfo& entry: crashDumpThreadStacks) { + flushRawStack(entry); + } + for (RawStackInfo& entry: crashDumpThreadStacks) { + flushStackTraceBuffer(entry); + } +} + +bool SimpleLoggerMgr::chkExitOnCrash() { + if (exitOnCrash) return true; + + std::string env_segv_str; + const char* env_segv = std::getenv("SIMPLELOGGER_EXIT_ON_CRASH"); + if (env_segv) env_segv_str = env_segv; + + if ( env_segv_str == "ON" || + env_segv_str == "on" || + env_segv_str == "TRUE" || + env_segv_str == "true" ) { + // Manually turned off by user, via env var. 
+ return true; + } + + return false; +} + +void SimpleLoggerMgr::handleSegFault(int sig) { +#if defined(__linux__) || defined(__APPLE__) + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + signal(SIGSEGV, mgr->oldSigSegvHandler); + mgr->enableOnlyOneDisplayer(); + mgr->flushAllLoggers(1, "Segmentation fault"); + mgr->logStackBacktrace(); + + printf("[SEG FAULT] Flushed all logs safely.\n"); + fflush(stdout); + + if (mgr->chkExitOnCrash()) { + printf("[SEG FAULT] Exit on crash.\n"); + fflush(stdout); + exit(-1); + } + + if (mgr->oldSigSegvHandler) { + mgr->oldSigSegvHandler(sig); + } +#endif +} + +void SimpleLoggerMgr::handleSegAbort(int sig) { +#if defined(__linux__) || defined(__APPLE__) + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + signal(SIGABRT, mgr->oldSigAbortHandler); + mgr->enableOnlyOneDisplayer(); + mgr->flushAllLoggers(1, "Abort"); + mgr->logStackBacktrace(); + + printf("[ABORT] Flushed all logs safely.\n"); + fflush(stdout); + + if (mgr->chkExitOnCrash()) { + printf("[ABORT] Exit on crash.\n"); + fflush(stdout); + exit(-1); + } + + abort(); +#endif +} + +#if defined(__linux__) || defined(__APPLE__) +void SimpleLoggerMgr::handleStackTrace(int sig, siginfo_t* info, void* secret) { +#ifndef __linux__ + // Not support non-Linux platform. + return; +#else + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + if (!mgr->crashOriginThread) return; + + pthread_t myself = pthread_self(); + if (mgr->crashOriginThread == myself) return; + + // NOTE: + // As getting exact line number is too expensive, + // keep stack pointers first and then interpret it. + mgr->addRawStackInfo(); + + // Go back to origin thread. + pthread_kill(mgr->crashOriginThread, SIGUSR2); +#endif +} +#endif + +// LCOV_EXCL_STOP + +void SimpleLoggerMgr::flushWorker() { +#ifdef __linux__ + pthread_setname_np(pthread_self(), "sl_flusher"); +#endif + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + while (!mgr->chkTermination()) { + // Every 500ms. 
+ size_t sub_ms = 500; + mgr->sleepFlusher(sub_ms); + mgr->flushAllLoggers(); + if (mgr->abortTimer) { + if (mgr->abortTimer > sub_ms) { + mgr->abortTimer.fetch_sub(sub_ms); + } else { + std::cerr << "STACK DUMP TIMEOUT, FORCE ABORT" << std::endl; + exit(-1); + } + } + } +} + +void SimpleLoggerMgr::compressWorker() { +#ifdef __linux__ + pthread_setname_np(pthread_self(), "sl_compressor"); +#endif + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + bool sleep_next_time = true; + while (!mgr->chkTermination()) { + // Every 500ms. + size_t sub_ms = 500; + if (sleep_next_time) { + mgr->sleepCompressor(sub_ms); + } + sleep_next_time = true; + + CompElem* elem = nullptr; + { std::lock_guard l(mgr->pendingCompElemsLock); + auto entry = mgr->pendingCompElems.begin(); + if (entry != mgr->pendingCompElems.end()) { + elem = *entry; + mgr->pendingCompElems.erase(entry); + } + } + + if (elem) { + elem->targetLogger->doCompression(elem->fileNum); + delete elem; + // Continuous compression if pending item exists. 
+ sleep_next_time = false; + } + } +} + +void SimpleLoggerMgr::setCrashDumpPath(const std::string& path, + bool origin_only) +{ + crashDumpPath = path; + setStackTraceOriginOnly(origin_only); +} + +void SimpleLoggerMgr::setStackTraceOriginOnly(bool origin_only) { + crashDumpOriginOnly = origin_only; +} + +void SimpleLoggerMgr::setExitOnCrash(bool exit_on_crash) { + exitOnCrash = exit_on_crash; +} + + +SimpleLoggerMgr::SimpleLoggerMgr() + : termination(false) + , oldSigSegvHandler(nullptr) + , oldSigAbortHandler(nullptr) + , stackTraceBuffer(nullptr) + , crashOriginThread(0) + , crashDumpOriginOnly(true) + , exitOnCrash(false) + , abortTimer(0) +{ +#if defined(__linux__) || defined(__APPLE__) + std::string env_segv_str; + const char* env_segv = std::getenv("SIMPLELOGGER_HANDLE_SEGV"); + if (env_segv) env_segv_str = env_segv; + + if ( env_segv_str == "OFF" || + env_segv_str == "off" || + env_segv_str == "FALSE" || + env_segv_str == "false" ) { + // Manually turned off by user, via env var. 
+ } else { + oldSigSegvHandler = signal(SIGSEGV, SimpleLoggerMgr::handleSegFault); + oldSigAbortHandler = signal(SIGABRT, SimpleLoggerMgr::handleSegAbort); + } + stackTraceBuffer = (char*)malloc(stackTraceBufferSize); + +#endif + tFlush = std::thread(SimpleLoggerMgr::flushWorker); + tCompress = std::thread(SimpleLoggerMgr::compressWorker); +} + +SimpleLoggerMgr::~SimpleLoggerMgr() { + termination = true; + +#if defined(__linux__) || defined(__APPLE__) + signal(SIGSEGV, oldSigSegvHandler); + signal(SIGABRT, oldSigAbortHandler); +#endif + { std::unique_lock l(cvFlusherLock); + cvFlusher.notify_all(); + } + { std::unique_lock l(cvCompressorLock); + cvCompressor.notify_all(); + } + if (tFlush.joinable()) { + tFlush.join(); + } + if (tCompress.joinable()) { + tCompress.join(); + } + + free(stackTraceBuffer); +} + +// LCOV_EXCL_START +void SimpleLoggerMgr::enableOnlyOneDisplayer() { + bool marked = false; + std::unique_lock l(loggersLock); + for (auto& entry: loggers) { + SimpleLogger* logger = entry; + if (!logger) continue; + if (!marked) { + // The first logger: enable display + if (logger->getLogLevel() < 4) { + logger->setLogLevel(4); + } + logger->setDispLevel(4); + marked = true; + } else { + // The others: disable display + logger->setDispLevel(-1); + } + } +} +// LCOV_EXCL_STOP + +void SimpleLoggerMgr::flushAllLoggers(int level, const std::string& msg) { + std::unique_lock l(loggersLock); + for (auto& entry: loggers) { + SimpleLogger* logger = entry; + if (!logger) continue; + if (!msg.empty()) { + logger->put(level, __FILE__, __func__, __LINE__, "%s", msg.c_str()); + } + logger->flushAll(); + } +} + +void SimpleLoggerMgr::addLogger(SimpleLogger* logger) { + std::unique_lock l(loggersLock); + loggers.insert(logger); +} + +void SimpleLoggerMgr::removeLogger(SimpleLogger* logger) { + std::unique_lock l(loggersLock); + loggers.erase(logger); +} + +void SimpleLoggerMgr::addThread(uint64_t tid) { + std::unique_lock l(activeThreadsLock); + activeThreads.insert(tid); 
+} + +void SimpleLoggerMgr::removeThread(uint64_t tid) { + std::unique_lock l(activeThreadsLock); + activeThreads.erase(tid); +} + +void SimpleLoggerMgr::addCompElem(SimpleLoggerMgr::CompElem* elem) { + { std::unique_lock l(pendingCompElemsLock); + pendingCompElems.push_back(elem); + } + { std::unique_lock l(cvCompressorLock); + cvCompressor.notify_all(); + } +} + +void SimpleLoggerMgr::sleepFlusher(size_t ms) { + std::unique_lock l(cvFlusherLock); + cvFlusher.wait_for(l, std::chrono::milliseconds(ms)); +} + +void SimpleLoggerMgr::sleepCompressor(size_t ms) { + std::unique_lock l(cvCompressorLock); + cvCompressor.wait_for(l, std::chrono::milliseconds(ms)); +} + +bool SimpleLoggerMgr::chkTermination() const { + return termination; +} + +void SimpleLoggerMgr::setCriticalInfo(const std::string& info_str) { + globalCriticalInfo = info_str; +} + +const std::string& SimpleLoggerMgr::getCriticalInfo() const { + return globalCriticalInfo; +} + + +// ========================================== + +struct ThreadWrapper { +#ifdef __linux__ + ThreadWrapper() { + myTid = (uint64_t)pthread_self(); + SimpleLoggerMgr* mgr = SimpleLoggerMgr::getWithoutInit(); + if (mgr) { + mgr->addThread(myTid); + } + } + ~ThreadWrapper() { + SimpleLoggerMgr* mgr = SimpleLoggerMgr::getWithoutInit(); + if (mgr) { + mgr->removeThread(myTid); + } + } +#else + ThreadWrapper() : myTid(0) {} + ~ThreadWrapper() {} +#endif + uint64_t myTid; +}; + + + +// ========================================== + +SimpleLogger::LogElem::LogElem() : len(0), status(CLEAN) { +#ifdef SUPPRESS_TSAN_FALSE_ALARMS + std::lock_guard l(ctxLock); +#endif + memset(ctx, 0x0, MSG_SIZE); +} + +// True if dirty. +bool SimpleLogger::LogElem::needToFlush() { + return status.load(MOR) == DIRTY; +} + +// True if no other thread is working on it. 
+bool SimpleLogger::LogElem::available() { + Status s = status.load(MOR); + return s == CLEAN || s == DIRTY; +} + +int SimpleLogger::LogElem::write(size_t _len, char* msg) { + Status exp = CLEAN; + Status val = WRITING; + if (!status.compare_exchange_strong(exp, val, MOR)) return -1; + + { +#ifdef SUPPRESS_TSAN_FALSE_ALARMS + std::lock_guard l(ctxLock); +#endif + len = (_len > MSG_SIZE) ? MSG_SIZE : _len; + memcpy(ctx, msg, len); + } + + status.store(LogElem::DIRTY, MOR); + return 0; +} + +int SimpleLogger::LogElem::flush(std::ofstream& fs) { + Status exp = DIRTY; + Status val = FLUSHING; + if (!status.compare_exchange_strong(exp, val, MOR)) return -1; + + { +#ifdef SUPPRESS_TSAN_FALSE_ALARMS + std::lock_guard l(ctxLock); +#endif + fs.write(ctx, len); + } + + status.store(LogElem::CLEAN, MOR); + return 0; +} + + +// ========================================== + + +SimpleLogger::SimpleLogger(const std::string& file_path, + size_t max_log_elems, + uint64_t log_file_size_limit, + uint32_t max_log_files) + : filePath(replaceString(file_path, "//", "/")) + , maxLogFiles(max_log_files) + , maxLogFileSize(log_file_size_limit) + , numCompJobs(0) + , curLogLevel(4) + , curDispLevel(4) + , tzGap( SimpleLoggerMgr::getTzGap() ) + , cursor(0) + , logs(max_log_elems) +{ + findMinMaxRevNum(minRevnum, curRevnum); +} + +SimpleLogger::~SimpleLogger() { + stop(); +} + +void SimpleLogger::setCriticalInfo(const std::string& info_str) { + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + if (mgr) { + mgr->setCriticalInfo(info_str); + } +} + +void SimpleLogger::setCrashDumpPath(const std::string& path, + bool origin_only) +{ + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + if (mgr) { + mgr->setCrashDumpPath(path, origin_only); + } +} + +void SimpleLogger::setStackTraceOriginOnly(bool origin_only) { + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + if (mgr) { + mgr->setStackTraceOriginOnly(origin_only); + } +} + +void SimpleLogger::logStackBacktrace() { + SimpleLoggerMgr* mgr = 
SimpleLoggerMgr::get(); + if (mgr) { + mgr->enableOnlyOneDisplayer(); + mgr->logStackBacktrace(0); + } +} + +void SimpleLogger::shutdown() { + SimpleLoggerMgr* mgr = SimpleLoggerMgr::getWithoutInit(); + if (mgr) { + mgr->destroy(); + } +} + +std::string SimpleLogger::replaceString( const std::string& src_str, + const std::string& before, + const std::string& after ) +{ + size_t last = 0; + size_t pos = src_str.find(before, last); + std::string ret; + while (pos != std::string::npos) { + ret += src_str.substr(last, pos - last); + ret += after; + last = pos + before.size(); + pos = src_str.find(before, last); + } + if (last < src_str.size()) { + ret += src_str.substr(last); + } + return ret; +} + +void SimpleLogger::findMinMaxRevNum( size_t& min_revnum_out, + size_t& max_revnum_out ) +{ + std::string dir_path = "./"; + std::string file_name_only = filePath; + size_t last_pos = filePath.rfind("/"); + if (last_pos != std::string::npos) { + dir_path = filePath.substr(0, last_pos); + file_name_only = filePath.substr + ( last_pos + 1, filePath.size() - last_pos - 1 ); + } + + bool min_revnum_initialized = false; + size_t min_revnum = 0; + size_t max_revnum = 0; + +#if defined(__linux__) || defined(__APPLE__) + DIR* dir_info = opendir(dir_path.c_str()); + struct dirent *dir_entry = nullptr; + while ( dir_info && (dir_entry = readdir(dir_info)) ) { + std::string f_name(dir_entry->d_name); + size_t f_name_pos = f_name.rfind(file_name_only); + // Irrelavent file: skip. 
+ if (f_name_pos == std::string::npos) continue; + + findMinMaxRevNumInternal(min_revnum_initialized, + min_revnum, + max_revnum, + f_name); + } + if (dir_info) { + closedir(dir_info); + } +#elif defined(WIN32) || defined(_WIN32) + // Windows + WIN32_FIND_DATA filedata; + HANDLE hfind; + std::string query_str = dir_path + "*"; + + // find all files start with 'prefix' + hfind = FindFirstFile(query_str.c_str(), &filedata); + while (hfind != INVALID_HANDLE_VALUE) { + std::string f_name(filedata.cFileName); + size_t f_name_pos = f_name.rfind(file_name_only); + // Irrelavent file: skip. + if (f_name_pos != std::string::npos) { + findMinMaxRevNumInternal(min_revnum_initialized, + min_revnum, + max_revnum, + f_name); + } + + if (!FindNextFile(hfind, &filedata)) { + FindClose(hfind); + hfind = INVALID_HANDLE_VALUE; + } + } +#endif + + min_revnum_out = min_revnum; + max_revnum_out = max_revnum; +} + +void SimpleLogger::findMinMaxRevNumInternal(bool& min_revnum_initialized, + size_t& min_revnum, + size_t& max_revnum, + std::string& f_name) +{ + size_t last_dot = f_name.rfind("."); + if (last_dot == std::string::npos) return; + + bool comp_file = false; + std::string ext = f_name.substr(last_dot + 1, f_name.size() - last_dot - 1); + if (ext == "gz" && f_name.size() > 7) { + // Compressed file: asdf.log.123.tar.gz => need to get 123. + f_name = f_name.substr(0, f_name.size() - 7); + last_dot = f_name.rfind("."); + if (last_dot == std::string::npos) return; + ext = f_name.substr(last_dot + 1, f_name.size() - last_dot - 1); + comp_file = true; + } + + size_t revnum = atoi(ext.c_str()); + max_revnum = std::max( max_revnum, + ( (comp_file) ? (revnum+1) : (revnum) ) ); + if (!min_revnum_initialized) { + min_revnum = revnum; + min_revnum_initialized = true; + } + min_revnum = std::min(min_revnum, revnum); +} + +std::string SimpleLogger::getLogFilePath(size_t file_num) const { + if (file_num) { + return filePath + "." 
+ std::to_string(file_num); + } + return filePath; +} + +int SimpleLogger::start() { + if (filePath.empty()) return 0; + + // Append at the end. + fs.open(getLogFilePath(curRevnum), std::ofstream::out | std::ofstream::app); + if (!fs) return -1; + + SimpleLoggerMgr* mgr = SimpleLoggerMgr::get(); + SimpleLogger* ll = this; + mgr->addLogger(ll); + + _log_sys(ll, "Start logger: %s (%zu MB per file, up to %zu files)", + filePath.c_str(), + maxLogFileSize / 1024 / 1024, + maxLogFiles); + + const std::string& critical_info = mgr->getCriticalInfo(); + if (!critical_info.empty()) { + _log_info(ll, "%s", critical_info.c_str()); + } + + return 0; +} + +int SimpleLogger::stop() { + if (fs.is_open()) { + SimpleLoggerMgr* mgr = SimpleLoggerMgr::getWithoutInit(); + if (mgr) { + SimpleLogger* ll = this; + mgr->removeLogger(ll); + + _log_sys(ll, "Stop logger: %s", filePath.c_str()); + flushAll(); + fs.flush(); + fs.close(); + + while (numCompJobs.load() > 0) std::this_thread::yield(); + } + } + + return 0; +} + +void SimpleLogger::setLogLevel(int level) { + if (level > 6) return; + if (!fs) return; + + curLogLevel = level; +} + +void SimpleLogger::setDispLevel(int level) { + if (level > 6) return; + + curDispLevel = level; +} + +#define _snprintf(msg, avail_len, cur_len, msg_len, ...) \ + avail_len = (avail_len > cur_len) ? (avail_len - cur_len) : 0; \ + msg_len = snprintf( msg + cur_len, avail_len, __VA_ARGS__ ); \ + cur_len += (avail_len > msg_len) ? msg_len : avail_len + +#define _vsnprintf(msg, avail_len, cur_len, msg_len, ...) \ + avail_len = (avail_len > cur_len) ? (avail_len - cur_len) : 0; \ + msg_len = vsnprintf( msg + cur_len, avail_len, __VA_ARGS__ ); \ + cur_len += (avail_len > msg_len) ? msg_len : avail_len + +void SimpleLogger::put(int level, + const char* source_file, + const char* func_name, + size_t line_number, + const char* format, + ...) 
+{
+    if (level > curLogLevel.load(MOR)) return;
+    if (!fs) return;
+
+    static const char* lv_names[7] = {"====",
+                                      "FATL", "ERRO", "WARN",
+                                      "INFO", "DEBG", "TRAC"};
+    char msg[MSG_SIZE];
+    thread_local std::thread::id tid = std::this_thread::get_id();
+    // NOTE(review): the hash's template argument was lost during extraction;
+    // `std::thread::id` restored from the `tid` operand.
+    thread_local uint32_t tid_hash = std::hash<std::thread::id>{}(tid) % 0x10000;
+    thread_local ThreadWrapper thread_wrapper;
+
+    // Print filename part only (excluding directory path).
+    size_t last_slash = 0;
+    for (size_t ii=0; source_file && source_file[ii] != 0; ++ii) {
+        if (source_file[ii] == '/' || source_file[ii] == '\\') last_slash = ii;
+    }
+
+    SimpleLoggerMgr::TimeInfo lt( std::chrono::system_clock::now() );
+    int tz_gap_abs = (tzGap < 0) ? (tzGap * -1) : (tzGap);
+
+    // [time] [tid] [log type] [user msg] [stack info]
+    // Timestamp: ISO 8601 format.
+    size_t cur_len = 0;
+    size_t avail_len = MSG_SIZE;
+    size_t msg_len = 0;
+
+    _snprintf( msg, avail_len, cur_len, msg_len,
+               "%04d-%02d-%02dT%02d:%02d:%02d.%03d_%03d%c%02d:%02d "
+               "[%04x] "
+               "[%s] ",
+               lt.year, lt.month, lt.day,
+               lt.hour, lt.min, lt.sec, lt.msec, lt.usec,
+               (tzGap >= 0)?'+':'-', tz_gap_abs / 60, tz_gap_abs % 60,
+               tid_hash,
+               lv_names[level] );
+
+    va_list args;
+    va_start(args, format);
+    _vsnprintf(msg, avail_len, cur_len, msg_len, format, args);
+    va_end(args);
+
+    if (source_file && func_name) {
+        _snprintf( msg, avail_len, cur_len, msg_len,
+                   "\t[%s:%zu, %s()]\n",
+                   source_file + ((last_slash)?(last_slash+1):0),
+                   line_number, func_name );
+    } else {
+        _snprintf(msg, avail_len, cur_len, msg_len, "\n");
+    }
+
+    size_t num = logs.size();
+    uint64_t cursor_exp = 0, cursor_val = 0;
+    LogElem* ll = nullptr;
+    do {
+        cursor_exp = cursor.load(MOR);
+        cursor_val = (cursor_exp + 1) % num;
+        ll = &logs[cursor_exp];
+    } while ( !cursor.compare_exchange_strong(cursor_exp, cursor_val, MOR) );
+    while ( !ll->available() ) std::this_thread::yield();
+
+    if (ll->needToFlush()) {
+        // Allow only one thread to flush.
+ if (!flush(cursor_exp)) { + // Other threads: wait. + while (ll->needToFlush()) std::this_thread::yield(); + } + } + ll->write(cur_len, msg); + + if (level > curDispLevel) return; + + // Console part. + static const char* colored_lv_names[7] = + { _CL_B_BROWN("===="), + _CL_WHITE_FG_RED_BG("FATL"), + _CL_B_RED("ERRO"), + _CL_B_MAGENTA("WARN"), + "INFO", + _CL_D_GRAY("DEBG"), + _CL_D_GRAY("TRAC") }; + + cur_len = 0; + avail_len = MSG_SIZE; + _snprintf( msg, avail_len, cur_len, msg_len, + " [" _CL_BROWN("%02d") ":" _CL_BROWN("%02d") ":" _CL_BROWN("%02d") "." + _CL_BROWN("%03d") " " _CL_BROWN("%03d") + "] [tid " _CL_B_BLUE("%04x") "] " + "[%s] ", + lt.hour, lt.min, lt.sec, lt.msec, lt.usec, + tid_hash, + colored_lv_names[level] ); + + if (source_file && func_name) { + _snprintf( msg, avail_len, cur_len, msg_len, + "[" _CL_GREEN("%s") ":" _CL_B_RED("%zu") + ", " _CL_CYAN("%s()") "]\n", + source_file + ((last_slash)?(last_slash+1):0), + line_number, func_name ); + } else { + _snprintf(msg, avail_len, cur_len, msg_len, "\n"); + } + + va_start(args, format); + +#ifndef LOGGER_NO_COLOR + if (level == 0) { + _snprintf(msg, avail_len, cur_len, msg_len, _CLM_B_BROWN); + } else if (level == 1) { + _snprintf(msg, avail_len, cur_len, msg_len, _CLM_B_RED); + } +#endif + + _vsnprintf(msg, avail_len, cur_len, msg_len, format, args); + +#ifndef LOGGER_NO_COLOR + _snprintf(msg, avail_len, cur_len, msg_len, _CLM_END); +#endif + + va_end(args); + (void)cur_len; + + std::unique_lock l(SimpleLoggerMgr::displayLock); + std::cout << msg << std::endl; + l.unlock(); +} + +void SimpleLogger::execCmd(const std::string& cmd_given) { + int r = 0; + std::string cmd = cmd_given; + +#if defined(__linux__) + cmd += " > /dev/null"; + r = system(cmd.c_str()); + +#elif defined(__APPLE__) + cmd += " 2> /dev/null"; + FILE* fp = popen(cmd.c_str(), "r"); + r = pclose(fp); +#endif + (void)r; +} + +void SimpleLogger::doCompression(size_t file_num) { +#if defined(__linux__) || defined(__APPLE__) + 
std::string filename = getLogFilePath(file_num); + std::string cmd; + cmd = "tar zcvf " + filename + ".tar.gz " + filename; + execCmd(cmd); + + cmd = "rm -f " + filename; + execCmd(cmd); + + // Remove previous log files. + if (maxLogFiles && file_num >= maxLogFiles) { + for (size_t ii=minRevnum; ii<=file_num-maxLogFiles; ++ii) { + filename = getLogFilePath(ii); + std::string filename_tar = getLogFilePath(ii) + ".tar.gz"; + cmd = "rm -f " + filename + " " + filename_tar; + execCmd(cmd); + minRevnum = ii+1; + } + } +#endif + + numCompJobs.fetch_sub(1); +} + +bool SimpleLogger::flush(size_t start_pos) { + std::unique_lock ll(flushingLogs, std::try_to_lock); + if (!ll.owns_lock()) return false; + + size_t num = logs.size(); + // Circular flush into file. + for (size_t ii=start_pos; ii (int64_t)maxLogFileSize ) { + // Exceeded limit, make a new file. + curRevnum++; + fs.close(); + fs.open(getLogFilePath(curRevnum), std::ofstream::out | std::ofstream::app); + + // Compress it (tar gz). Register to the global queue. +#ifndef SUPPRESS_TSAN_FALSE_ALARMS + SimpleLoggerMgr* mgr = SimpleLoggerMgr::getWithoutInit(); + if (mgr) { + numCompJobs.fetch_add(1); + SimpleLoggerMgr::CompElem* elem = + new SimpleLoggerMgr::CompElem(curRevnum-1, this); + mgr->addCompElem(elem); + } +#endif + } + + return true; +} + +void SimpleLogger::flushAll() { + uint64_t start_pos = cursor.load(MOR); + flush(start_pos); +} + diff --git a/src/logger.h b/src/logger.h new file mode 100644 index 0000000..78eac03 --- /dev/null +++ b/src/logger.h @@ -0,0 +1,485 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2017 Jung-Sang Ahn +See URL: https://github.com/greensky00/simple_logger + (v0.3.25) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#if defined(__linux__) || defined(__APPLE__) + #include +#endif + +// To suppress false alarms by thread sanitizer, +// add -DSUPPRESS_TSAN_FALSE_ALARMS=1 flag to CXXFLAGS. +// #define SUPPRESS_TSAN_FALSE_ALARMS (1) + +// 0: System [====] +// 1: Fatal [FATL] +// 2: Error [ERRO] +// 3: Warning [WARN] +// 4: Info [INFO] +// 5: Debug [DEBG] +// 6: Trace [TRAC] + + +// printf style log macro +#define _log_(level, l, ...) \ + if (l && l->getLogLevel() >= level) \ + (l)->put(level, __FILE__, __func__, __LINE__, __VA_ARGS__) + +#define _log_sys(l, ...) _log_(SimpleLogger::SYS, l, __VA_ARGS__) +#define _log_fatal(l, ...) _log_(SimpleLogger::FATAL, l, __VA_ARGS__) +#define _log_err(l, ...) _log_(SimpleLogger::ERROR, l, __VA_ARGS__) +#define _log_warn(l, ...) _log_(SimpleLogger::WARNING, l, __VA_ARGS__) +#define _log_info(l, ...) _log_(SimpleLogger::INFO, l, __VA_ARGS__) +#define _log_debug(l, ...) _log_(SimpleLogger::DEBUG, l, __VA_ARGS__) +#define _log_trace(l, ...) 
_log_(SimpleLogger::TRACE, l, __VA_ARGS__) + + +// stream log macro +#define _stream_(level, l) \ + if (l && l->getLogLevel() >= level) \ + l->eos() = l->stream(level, l, __FILE__, __func__, __LINE__) + +#define _s_sys(l) _stream_(SimpleLogger::SYS, l) +#define _s_fatal(l) _stream_(SimpleLogger::FATAL, l) +#define _s_err(l) _stream_(SimpleLogger::ERROR, l) +#define _s_warn(l) _stream_(SimpleLogger::WARNING, l) +#define _s_info(l) _stream_(SimpleLogger::INFO, l) +#define _s_debug(l) _stream_(SimpleLogger::DEBUG, l) +#define _s_trace(l) _stream_(SimpleLogger::TRACE, l) + + +// Do printf style log, but print logs in `lv1` level during normal time, +// once in given `interval_ms` interval, print a log in `lv2` level. +// The very first log will be printed in `lv2` level. +// +// This function is global throughout the process, so that +// multiple threads will share the interval. +#define _timed_log_g(l, interval_ms, lv1, lv2, ...) \ +{ \ + _timed_log_definition(static); \ + _timed_log_body(l, interval_ms, lv1, lv2, __VA_ARGS__); \ +} + +// Same as `_timed_log_g` but per-thread level. +#define _timed_log_t(l, interval_ms, lv1, lv2, ...) \ +{ \ + _timed_log_definition(thread_local); \ + _timed_log_body(l, interval_ms, lv1, lv2, __VA_ARGS__); \ +} + +#define _timed_log_definition(prefix) \ + prefix std::mutex timer_lock; \ + prefix bool first_event_fired = false; \ + prefix std::chrono::system_clock::time_point last_timeout = \ + std::chrono::system_clock::now(); + +#define _timed_log_body(l, interval_ms, lv1, lv2, ...) 
\ + std::chrono::system_clock::time_point cur = \ + std::chrono::system_clock::now(); \ + std::chrono::duration elapsed = cur - last_timeout; \ + bool timeout = false; \ + { std::lock_guard l(timer_lock); \ + if ( elapsed.count() * 1000 > interval_ms || \ + !first_event_fired ) { \ + cur = std::chrono::system_clock::now(); \ + elapsed = cur - last_timeout; \ + if ( elapsed.count() * 1000 > interval_ms || \ + !first_event_fired ) { \ + timeout = first_event_fired = true; \ + last_timeout = cur; \ + } \ + } \ + } \ + if (timeout) { \ + _log_(lv2, l, __VA_ARGS__); \ + } else { \ + _log_(lv1, l, __VA_ARGS__); \ + } + + +class SimpleLoggerMgr; +class SimpleLogger { + friend class SimpleLoggerMgr; +public: + static const int MSG_SIZE = 4096; + static const std::memory_order MOR = std::memory_order_relaxed; + + enum Levels { + SYS = 0, + FATAL = 1, + ERROR = 2, + WARNING = 3, + INFO = 4, + DEBUG = 5, + TRACE = 6, + UNKNOWN = 99, + }; + + class LoggerStream : public std::ostream { + public: + LoggerStream() : std::ostream(&buf), level(0), logger(nullptr) + , file(nullptr), func(nullptr), line(0) {} + + template + inline LoggerStream& operator<<(const T& data) { + sStream << data; + return *this; + } + + using MyCout = std::basic_ostream< char, std::char_traits >; + typedef MyCout& (*EndlFunc)(MyCout&); + inline LoggerStream& operator<<(EndlFunc func) { + func(sStream); + return *this; + } + + inline void put() { + if (logger) { + logger->put( level, file, func, line, + "%s", sStream.str().c_str() ); + } + } + + inline void setLogInfo(int _level, + SimpleLogger* _logger, + const char* _file, + const char* _func, + size_t _line) + { + sStream.str(std::string()); + level = _level; + logger = _logger; + file = _file; + func = _func; + line = _line; + } + + private: + std::stringbuf buf; + std::stringstream sStream; + int level; + SimpleLogger* logger; + const char* file; + const char* func; + size_t line; + }; + + class EndOfStmt { + public: + EndOfStmt() {} + 
EndOfStmt(LoggerStream& src) { src.put(); } + EndOfStmt& operator=(LoggerStream& src) { src.put(); return *this; } + }; + + LoggerStream& stream( int level, + SimpleLogger* logger, + const char* file, + const char* func, + size_t line ) { + thread_local LoggerStream msg; + msg.setLogInfo(level, logger, file, func, line); + return msg; + } + + EndOfStmt& eos() { + thread_local EndOfStmt _eos; + return _eos; + } + +private: + struct LogElem { + enum Status { + CLEAN = 0, + WRITING = 1, + DIRTY = 2, + FLUSHING = 3, + }; + + LogElem(); + + // True if dirty. + bool needToFlush(); + + // True if no other thread is working on it. + bool available(); + + int write(size_t _len, char* msg); + int flush(std::ofstream& fs); + +#ifdef SUPPRESS_TSAN_FALSE_ALARMS + // To avoid false alarm by TSan. + std::mutex ctxLock; +#endif + size_t len; + char ctx[MSG_SIZE]; + std::atomic status; + }; + +public: + SimpleLogger(const std::string& file_path, + size_t max_log_elems = 4096, + uint64_t log_file_size_limit = 32*1024*1024, + uint32_t max_log_files = 16); + ~SimpleLogger(); + + static void setCriticalInfo(const std::string& info_str); + static void setCrashDumpPath(const std::string& path, + bool origin_only = true); + static void setStackTraceOriginOnly(bool origin_only); + static void logStackBacktrace(); + + static void shutdown(); + static std::string replaceString(const std::string& src_str, + const std::string& before, + const std::string& after); + + int start(); + int stop(); + + inline bool traceAllowed() const { return (curLogLevel.load(MOR) >= 6); } + inline bool debugAllowed() const { return (curLogLevel.load(MOR) >= 5); } + + void setLogLevel(int level); + void setDispLevel(int level); + + inline int getLogLevel() const { return curLogLevel.load(MOR); } + inline int getDispLevel() const { return curDispLevel.load(MOR); } + + void put(int level, + const char* source_file, + const char* func_name, + size_t line_number, + const char* format, + ...); + void flushAll(); + 
+private: + void calcTzGap(); + void findMinMaxRevNum(size_t& min_revnum_out, + size_t& max_revnum_out); + void findMinMaxRevNumInternal(bool& min_revnum_initialized, + size_t& min_revnum, + size_t& max_revnum, + std::string& f_name); + std::string getLogFilePath(size_t file_num) const; + void execCmd(const std::string& cmd); + void doCompression(size_t file_num); + bool flush(size_t start_pos); + + std::string filePath; + size_t minRevnum; + size_t curRevnum; + size_t maxLogFiles; + std::ofstream fs; + + uint64_t maxLogFileSize; + std::atomic numCompJobs; + + // Log up to `curLogLevel`, default: 6. + // Disable: -1. + std::atomic curLogLevel; + + // Display (print out on terminal) up to `curDispLevel`, + // default: 4 (do not print debug and trace). + // Disable: -1. + std::atomic curDispLevel; + + std::mutex displayLock; + + int tzGap; + std::atomic cursor; + std::vector logs; + std::mutex flushingLogs; +}; + +// Singleton class +class SimpleLoggerMgr { +public: + struct CompElem; + + struct TimeInfo { + TimeInfo(std::tm* src); + TimeInfo(std::chrono::system_clock::time_point now); + int year; + int month; + int day; + int hour; + int min; + int sec; + int msec; + int usec; + }; + + struct RawStackInfo { + RawStackInfo() : tidHash(0), kernelTid(0), crashOrigin(false) {} + uint32_t tidHash; + uint64_t kernelTid; + std::vector stackPtrs; + bool crashOrigin; + }; + + static SimpleLoggerMgr* init(); + static SimpleLoggerMgr* get(); + static SimpleLoggerMgr* getWithoutInit(); + static void destroy(); + static int getTzGap(); + static void handleSegFault(int sig); + static void handleSegAbort(int sig); +#if defined(__linux__) || defined(__APPLE__) + static void handleStackTrace(int sig, siginfo_t* info, void* secret); +#endif + static void flushWorker(); + static void compressWorker(); + + void logStackBacktrace(size_t timeout_ms = 60*1000); + void flushCriticalInfo(); + void enableOnlyOneDisplayer(); + void flushAllLoggers() { flushAllLoggers(0, std::string()); } + 
void flushAllLoggers(int level, const std::string& msg);
+    void addLogger(SimpleLogger* logger);
+    void removeLogger(SimpleLogger* logger);
+    void addThread(uint64_t tid);
+    void removeThread(uint64_t tid);
+    void addCompElem(SimpleLoggerMgr::CompElem* elem);
+    void sleepFlusher(size_t ms);
+    void sleepCompressor(size_t ms);
+    bool chkTermination() const;
+    void setCriticalInfo(const std::string& info_str);
+    void setCrashDumpPath(const std::string& path,
+                          bool origin_only);
+    void setStackTraceOriginOnly(bool origin_only);
+
+    /**
+     * Set the flag regarding exiting on crash.
+     * If flag is `true`, custom segfault handler will not invoke
+     * original handler so that process will terminate without
+     * generating core dump.
+     * The flag is `false` by default.
+     *
+     * @param exit_on_crash New flag value.
+     * @return void.
+     */
+    void setExitOnCrash(bool exit_on_crash);
+
+    const std::string& getCriticalInfo() const;
+
+    static std::mutex displayLock;
+
+private:
+    // Copy is not allowed.
+    SimpleLoggerMgr(const SimpleLoggerMgr&) = delete;
+    SimpleLoggerMgr& operator=(const SimpleLoggerMgr&) = delete;
+
+    static const size_t stackTraceBufferSize = 65536;
+
+    // Singleton instance and lock.
+    // NOTE(review): template arguments on the members below were lost
+    // during extraction; restored from their uses (singleton pointer,
+    // logger/thread registries in logger.cc).
+    static std::atomic<SimpleLoggerMgr*> instance;
+    static std::mutex instanceLock;
+
+    SimpleLoggerMgr();
+    ~SimpleLoggerMgr();
+
+    void _flushStackTraceBuffer(size_t buffer_len,
+                                uint32_t tid_hash,
+                                uint64_t kernel_tid,
+                                bool crash_origin);
+    void flushStackTraceBuffer(RawStackInfo& stack_info);
+    void flushRawStack(RawStackInfo& stack_info);
+    void addRawStackInfo(bool crash_origin = false);
+    void logStackBackTraceOtherThreads();
+
+    bool chkExitOnCrash();
+
+    std::mutex loggersLock;
+    std::unordered_set<SimpleLogger*> loggers;
+
+    std::mutex activeThreadsLock;
+    std::unordered_set<uint64_t> activeThreads;
+
+    // Periodic log flushing thread.
+    std::thread tFlush;
+
+    // Old log file compression thread.
+    std::thread tCompress;
+
+    // List of files to be compressed.
+ std::list pendingCompElems; + + // Lock for `pendingCompFiles`. + std::mutex pendingCompElemsLock; + + // Condition variable for BG flusher. + std::condition_variable cvFlusher; + std::mutex cvFlusherLock; + + // Condition variable for BG compressor. + std::condition_variable cvCompressor; + std::mutex cvCompressorLock; + + // Termination signal. + std::atomic termination; + + // Original segfault handler. + void (*oldSigSegvHandler)(int); + + // Original abort handler. + void (*oldSigAbortHandler)(int); + + // Critical info that will be displayed on crash. + std::string globalCriticalInfo; + + // Reserve some buffer for stack trace. + char* stackTraceBuffer; + + // TID of thread where crash happens. + std::atomic crashOriginThread; + + std::string crashDumpPath; + std::ofstream crashDumpFile; + + // If `true`, generate stack trace only for the origin thread. + // Default: `true`. + bool crashDumpOriginOnly; + + // If `true`, do not invoke original segfault handler + // so that process just terminates. + // Default: `false`. + bool exitOnCrash; + + std::atomic abortTimer; + + // Assume that only one thread is updating this. + std::vector crashDumpThreadStacks; +}; + diff --git a/src/memtable.cc b/src/memtable.cc new file mode 100644 index 0000000..039cc0d --- /dev/null +++ b/src/memtable.cc @@ -0,0 +1,1250 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "memtable.h" + +#include "bloomfilter.h" +#include "crc32.h" +#include "db_mgr.h" +#include "internal_helper.h" +#include "log_file.h" +#include "log_mgr.h" +#include "skiplist.h" + +#include "generic_bitmap.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include +#include +#include + +namespace jungle { + +#define REC_LEN_SIZE (20) + +MemTable::RecNode::RecNode(const SizedBuf& _key) { + skiplist_init_node(&snode); + recList = new RecList(); + key.alloc(_key); + freeKey = true; +} + +MemTable::RecNode::RecNode(const SizedBuf* _key) + : recList(nullptr) { + // Just keep the pointer (for query purpose). + skiplist_init_node(&snode); + key.referTo(*_key); + freeKey = false; +} + +MemTable::RecNode::~RecNode() { + { mGuard l(recListLock); + DELETE(recList); + } + if (freeKey) key.free(); + skiplist_free_node(&snode); +} + + +int MemTable::RecNode::cmp(skiplist_node *a, skiplist_node *b, void *aux) { + RecNode *aa, *bb; + aa = _get_entry(a, RecNode, snode); + bb = _get_entry(b, RecNode, snode); + + return SizedBuf::cmp(aa->key, bb->key); +} + +Record* MemTable::RecNode::getLatestRecord(const uint64_t chk) { + mGuard l(recListLock); + Record* rec = nullptr; + auto entry = recList->rbegin(); + while (entry != recList->rend()) { + Record* tmp = *entry; + if (!valid_number(chk) || tmp->seqNum <= chk) { + rec = tmp; + break; + } + entry++; + } + return rec; +} + +uint64_t MemTable::RecNode::getMinSeq() { + mGuard l(recListLock); + auto entry = recList->begin(); + if (entry == recList->end()) { + // Shouldn't happen. 
+        assert(0);
+    }
+    Record* rec = *entry;
+    return rec->seqNum;
+}
+
+bool MemTable::RecNode::validKeyExist( const uint64_t chk,
+                                       bool allow_tombstone )
+{
+    bool valid_key_exist = true;
+
+    if (getMinSeq() > chk) {
+        // No record belongs to the current snapshot (iterator),
+        valid_key_exist = false;
+    }
+
+    if (valid_key_exist) {
+        Record* rec = getLatestRecord(chk);
+        assert(rec);
+        if ( !allow_tombstone &&
+             !rec->isIns() ) {
+            // Record exists, but latest one is not an insert,
+            // and doesn't allow tombstone.
+            valid_key_exist = false;
+        }
+    }
+    return valid_key_exist;
+}
+
+MemTable::RecNodeSeq::RecNodeSeq(Record* _rec)
+    : rec(_rec)
+{
+    skiplist_init_node(&snode);
+}
+
+MemTable::RecNodeSeq::~RecNodeSeq() {
+    skiplist_free_node(&snode);
+}
+
+int MemTable::RecNodeSeq::cmp(skiplist_node *a, skiplist_node *b, void *aux) {
+    RecNodeSeq *aa, *bb;
+    aa = _get_entry(a, RecNodeSeq , snode);
+    bb = _get_entry(b, RecNodeSeq , snode);
+
+    if (aa->rec->seqNum < bb->rec->seqNum) return -1;
+    if (aa->rec->seqNum > bb->rec->seqNum) return 1;
+    return 0;
+}
+
+uint64_t record_flags(Record* rec) {
+    // 00 00 00 00 00 00 00 00: normal doc (insertion)
+    // 00 00 00 00 00 00 00 01: deletion marker
+    // 00 00 00 00 00 00 00 02: other commands
+    uint64_t ret = 0x0;
+    switch (rec->type) {
+    case Record::Type::INSERTION:
+        ret = 0x0;
+        break;
+    case Record::Type::DELETION:
+        ret = 0x1;
+        break;
+    case Record::Type::COMMAND:
+        ret = 0x2;
+        break;
+    }
+    return ret;
+}
+
+Record::Type get_record_type_from_flags(uint64_t flags) {
+    // NOTE(review): the cast's template argument was lost during
+    // extraction; `Record::Type` restored from the return type.
+    return static_cast<Record::Type>(flags & 0xff);
+}
+
+uint64_t flush_marker_flags() {
+    // 01 00 00 00 00 00 00 00: flush marker
+    uint64_t ret = 0x01;
+    ret = ret << 56; // 7-byte shift
+    return ret;
+}
+
+uint64_t checkpoint_flags() {
+    // 02 00 00 00 00 00 00 00: checkpoint marker
+    uint64_t ret = 0x02;
+    ret = ret << 56; // 7-byte shift
+    return ret;
+}
+
+enum FlagType {
+    // 00 00 00 00 00 00 00 00: RECORD
+    // 01 00 00 00 00 00 00 00: FLUSH_MARKER
+    // 02 00 00
00 00 00 00 00: CHECKPOINT + // fe 00 00 00 00 00 00 00: PADDING + RECORD = 0, + FLUSH_MARKER = 1, + CHECKPOINT = 2, + // Indicate that the following bytes are just for alignment + PADDING = 254, + UNKNOWN = 255 +}; + +FlagType identify_type(uint64_t flags) { + if (flags == PADDING_HEADER_FLAG) return FlagType::PADDING; + flags = flags >> (8 * 7); + if (flags == 0x0 ) return FlagType::RECORD; + else if (flags == 0x01) return FlagType::FLUSH_MARKER; + else if (flags == 0x02) return FlagType::CHECKPOINT; + else if (flags == 0xfe) return FlagType::PADDING; + + return FlagType::UNKNOWN; +} + +MemTable::MemTable(const LogFile* log_file) + : startSeqNum(0) + , minSeqNum(NOT_INITIALIZED) + , flushedSeqNum(NOT_INITIALIZED) + , syncedSeqNum(NOT_INITIALIZED) + , maxSeqNum(NOT_INITIALIZED) + , seqNumAlloc(NOT_INITIALIZED) + , bytesSize(0) + , checkpointsDirty(false) + , logFile(log_file) + , lastInsertedRec(nullptr) + , increasingOrder(false) + , myLog(nullptr) +{ + idxByKey = new skiplist_raw(); + skiplist_init(idxByKey, MemTable::cmpKey); + + idxBySeq = new skiplist_raw(); + skiplist_init(idxBySeq, RecNodeSeq::cmp); + + // 1M bits (128KB) for 16384 entries. + uint64_t bf_bitmap_size = + logFile->logMgr->getDbConfig()->maxEntriesInLogFile * 64; + bfByKey = new BloomFilter(bf_bitmap_size, 3); +} + +MemTable::~MemTable() { + if (idxByKey) { + // Note: due to dedup, + // # logs in skiplist <= # total logs. 
+ skiplist_node* cursor = skiplist_begin(idxByKey); + while (cursor) { + RecNode* node = _get_entry(cursor, RecNode, snode); + cursor = skiplist_next(idxByKey, cursor); + + skiplist_erase_node(idxByKey, &node->snode); + skiplist_release_node(&node->snode); + skiplist_wait_for_free(&node->snode); + delete node; + } + skiplist_free(idxByKey); + delete idxByKey; + + { mGuard l(staleLogsLock); + for (auto& entry: staleLogs) { + Record* rec = entry; + rec->free(); + delete rec; + } + } + } + + if (idxBySeq) { + skiplist_node* cursor = skiplist_begin(idxBySeq); + while (cursor) { + RecNodeSeq* node = _get_entry(cursor, RecNodeSeq, snode); + cursor = skiplist_next(idxBySeq, cursor); + + skiplist_erase_node(idxBySeq, &node->snode); + skiplist_release_node(&node->snode); + skiplist_wait_for_free(&node->snode); + node->rec->free(); + delete node->rec; + delete node; + } + skiplist_free(idxBySeq); + delete idxBySeq; + } + + if (bfByKey) { + delete bfByKey; + bfByKey = nullptr; + } +} + +int MemTable::cmpKey(skiplist_node *a, skiplist_node *b, void *aux) { + RecNode *aa, *bb; + aa = _get_entry(a, RecNode, snode); + bb = _get_entry(b, RecNode, snode); + + CMP_NULL_CHK(aa->key.data, bb->key.data); + + if (aux) { + // Custom cmp + MemTable* mt = reinterpret_cast(aux); + CustomCmpFunc func = mt->logFile->logMgr->getDbConfig()->cmpFunc; + void* param = mt->logFile->logMgr->getDbConfig()->cmpFuncParam; + return func(aa->key.data, aa->key.size, + bb->key.data, bb->key.size, + param); + } + + return SizedBuf::cmp(aa->key, bb->key); +} + +Status MemTable::getReady() { + // Set custom cmp function if given. 
+ if (logFile->logMgr->getDbConfig()->cmpFunc) { + skiplist_raw_config s_config = skiplist_get_config(idxByKey); + s_config.aux = (void*)this; + skiplist_set_config(idxByKey, s_config); + } + return Status(); +} + +Status MemTable::init(const uint64_t start_seq_num) { + getReady(); + + startSeqNum = start_seq_num; + return Status(); +} + +Status MemTable::assignSeqNum(Record& rec_local) { + if (rec_local.seqNum == NOT_INITIALIZED) { + // Seqnum is not given. + uint64_t expected = NOT_INITIALIZED; + uint64_t val = startSeqNum; + + // NOTE: seqnum should start from 1. + if (!val) val = 1; + if (seqNumAlloc.compare_exchange_weak(expected, val)) { + // First insert. + minSeqNum = val; + rec_local.seqNum = val; + } else { + // NOTE: `memory_order_seq_cst` will be thread-safe. + uint64_t seq = seqNumAlloc.fetch_add(1, MOSC) + 1; + rec_local.seqNum = seq; + } + + } else { + // Seqnum is given by user. + uint64_t expected = seqNumAlloc; + uint64_t val = rec_local.seqNum; + if (expected != NOT_INITIALIZED && + rec_local.seqNum <= expected && + !logFile->logMgr->getDbConfig()->allowOverwriteSeqNum) { + // Overwrite is not allowed, fail. + return Status::INVALID_SEQNUM; + } + + if (minSeqNum != NOT_INITIALIZED && + rec_local.seqNum < minSeqNum) { + // Smaller than the min seq num. + return Status::INVALID_SEQNUM; + } + + if (seqNumAlloc.compare_exchange_weak(expected, val)) { + if (expected == NOT_INITIALIZED) { + // First insert. + minSeqNum = rec_local.seqNum; + } + } else { + // Other thread interfered. + if (!logFile->logMgr->getDbConfig()->allowOverwriteSeqNum) { + // Overwrite is not allowed. + return Status::INVALID_SEQNUM; + } + } + } + return Status(); +} + +Status MemTable::updateMaxSeqNum(const uint64_t seq_num) { + uint64_t expected = maxSeqNum; + uint64_t val = seq_num; + if (expected < seq_num || expected == NOT_INITIALIZED) { + maxSeqNum.compare_exchange_weak(expected, val); + } // Otherwise, ignore it. 
+ return Status(); +} + +MemTable::RecNode* MemTable::findRecNode(Record* rec) { + RecNode query(&rec->kv.key); + skiplist_node* cursor = skiplist_find(idxByKey, &query.snode); + RecNode* existing = _get_entry(cursor, RecNode, snode); + return existing; +} + +void MemTable::addToByKeyIndex(Record* rec) { + // Find existing record. + RecNode* existing_node = findRecNode(rec); + if (!existing_node) { + // Not exist, brand-new key. + + // Create a RecNode as a wrapper + // both for insertion and deletion (tombstone). + RecNode* rec_node = new RecNode(rec->kv.key); + { + mGuard l(rec_node->recListLock); + rec_node->recList->push_back(rec); + } + + int ret = skiplist_insert_nodup(idxByKey, &rec_node->snode); + if (ret == 0) { + bfByKey->set(rec->kv.key.data, rec->kv.key.size); + return; + } else { + // Concurrent thread already inserted it, re-find. + delete rec_node; + existing_node = findRecNode(rec); + assert(existing_node); + } + } + + // Existing key RecNode: just push back. + mGuard l(existing_node->recListLock); + existing_node->recList->push_back(rec); + skiplist_release_node(&existing_node->snode); +} + +Status MemTable::addToBySeqIndex(Record* rec, Record*& prev_rec_out) { + RecNodeSeq query(rec); + skiplist_node* cursor = skiplist_find(idxBySeq, &query.snode); + if (cursor) { + RecNodeSeq* prev = _get_entry(cursor, RecNodeSeq, snode); + prev_rec_out = prev->rec; + // If already exist, replace and then return the old one. + prev->rec = rec; + skiplist_release_node(&prev->snode); + return Status(); + } + + // TODO: multi-thread update on the same key? + RecNodeSeq* new_rec_node = new RecNodeSeq(rec); + skiplist_insert(idxBySeq, &new_rec_node->snode); + prev_rec_out = nullptr; + + return Status(); +} + +Status MemTable::putNewRecord(const Record& _rec) { + Status s; + Record rec_local = _rec; + // If deletion, clear value. 
+ if (rec_local.isDel()) { + rec_local.kv.value = SizedBuf(); + } + + s = assignSeqNum(rec_local); + if (!s) return s; + + // Make a clone + Record* rec = new Record(); + s = rec->clone(rec_local); + if (!s) return s; + + // Append into by-seq index + Record* prev_rec = nullptr; + addToBySeqIndex(rec, prev_rec); + + if (prev_rec) { + mGuard l(staleLogsLock); + staleLogs.push_back(prev_rec); + } + + // If this is a special command (not ins or del), + // skip by-key index update. + if (!rec->isCmd()) { + addToByKeyIndex(rec); + } + + if (!lastInsertedRec) { + increasingOrder = true; + } else { + if ( increasingOrder && + lastInsertedRec->kv.key > rec->kv.key ) { + increasingOrder = false; + } + } + lastInsertedRec = rec; + + updateMaxSeqNum(rec_local.seqNum); + + bytesSize += rec->size(); + return Status(); +} + +Status MemTable::findRecordBySeq(const uint64_t seq_num, + Record& rec_out) +{ + if ( flushedSeqNum != NOT_INITIALIZED && + seq_num <= flushedSeqNum ) + return Status::ALREADY_PURGED; + + Record query_rec; + RecNodeSeq query(&query_rec); + query.rec->seqNum = seq_num; + + skiplist_node* cursor = skiplist_find(idxBySeq, &query.snode); + if (cursor) { + RecNodeSeq* rec_node = _get_entry(cursor, RecNodeSeq, snode); + rec_out = *rec_node->rec; + skiplist_release_node(cursor); + return Status(); + } + + return Status::SEQNUM_NOT_FOUND; +} + +Status MemTable::getRecordByKey(const uint64_t chk, + const SizedBuf& key, + uint64_t* key_hash, + Record& rec_out, + bool allow_tombstone) +{ + // Check bloom filter first for fast screening. 
+ if (key_hash) { + if (!bfByKey->check(key_hash)) return Status::KEY_NOT_FOUND; + } else { + if (!bfByKey->check(key.data, key.size)) return Status::KEY_NOT_FOUND; + } + + RecNode query(&key); + skiplist_node* cursor = skiplist_find(idxByKey, &query.snode); + if (!cursor) return Status::KEY_NOT_FOUND; + + RecNode* node = _get_entry(cursor, RecNode, snode); + Record* rec_ret = node->getLatestRecord(chk); + + if ( !rec_ret || + ( valid_number(flushedSeqNum) && + rec_ret->seqNum <= flushedSeqNum ) ) { + // Already purged KV pair, go to table. + if (!valid_number(chk)) { + skiplist_release_node(&node->snode); + return Status::KEY_NOT_FOUND; + } // Tolerate if this is snapshot. + } + + if ( !allow_tombstone && rec_ret->isDel() ) { + // Last operation is deletion. + skiplist_release_node(&node->snode); + return Status::KEY_NOT_FOUND; + } + rec_out = *rec_ret; + skiplist_release_node(&node->snode); + return Status(); +} + +Status MemTable::sync(FileOps* f_ops, + FileHandle* fh) +{ + Status s; + s = f_ops->fsync(fh); + if (!s) return s; + + return Status(); +} + +Status MemTable::loadRecord(RwSerializer& rws, + uint64_t flags, + uint64_t& seqnum_out) +{ + Status s; + Record* rec = new Record(); + rec->type = get_record_type_from_flags(flags); + + try{ + uint32_t crc_len = 0; + uint8_t len_buf[32]; + if (!rws.available(4 + REC_LEN_SIZE + 4)) { + throw Status(Status::INCOMPLETE_LOG); + } + + TC_( crc_len = rws.getU32(s) ); + TC( rws.get(len_buf, REC_LEN_SIZE) ); + + uint32_t crc_len_chk = crc32_8(len_buf, REC_LEN_SIZE, 0); + if (crc_len != crc_len_chk) { + _log_err(myLog, "crc error %x != %x, at %zu", + crc_len, crc_len_chk, rws.pos() - sizeof(crc_len)); + throw Status(Status::CHECKSUM_ERROR); + } + + RwSerializer len_buf_rw(len_buf, 32); + rec->seqNum = len_buf_rw.getU64(s); + seqnum_out = rec->seqNum; + + uint32_t k_size = len_buf_rw.getU32(s); + uint32_t m_size = len_buf_rw.getU32(s); + uint32_t v_size = len_buf_rw.getU32(s); + + uint32_t crc_data = 0; + TC_( crc_data = 
rws.getU32(s) ); + + uint32_t crc_data_chk = 0; + if (k_size) { + if (!rws.available(k_size)) { + throw Status(Status::INCOMPLETE_LOG); + } + rec->kv.key.alloc(k_size, nullptr); + TC( rws.get(rec->kv.key.data, k_size) ); + crc_data_chk = crc32_8(rec->kv.key.data, k_size, crc_data_chk); + } + if (m_size) { + if (!rws.available(m_size)) { + throw Status(Status::INCOMPLETE_LOG); + } + rec->meta.alloc(m_size, nullptr); + TC( rws.get(rec->meta.data, m_size) ); + crc_data_chk = crc32_8(rec->meta.data, m_size, crc_data_chk); + } + if (v_size) { + if (!rws.available(v_size)) { + throw Status(Status::INCOMPLETE_LOG); + } + rec->kv.value.alloc(v_size, nullptr); + TC( rws.get(rec->kv.value.data, v_size) ); + crc_data_chk = crc32_8(rec->kv.value.data, v_size, crc_data_chk); + } + + if (crc_data != crc_data_chk) { + _log_err(myLog, "crc error %x != %x", crc_data, crc_data_chk); + throw Status(Status::CHECKSUM_ERROR); + } + + Record* prev_rec = nullptr; + addToBySeqIndex(rec, prev_rec); + if (prev_rec) { + mGuard l(staleLogsLock); + staleLogs.push_back(prev_rec); + } + + bytesSize += rec->size(); + + if (rec->isCmd()) { + // If this is a special command (not ins or del), + // skip by-key index update. + return Status(); + } + addToByKeyIndex(rec); + + return Status(); + + } catch (Status s) { + rec->free(); + delete rec; + return s; + } +} + +Status MemTable::loadFlushMarker(RwSerializer& rws, + uint64_t& synced_seq_num_out) +{ + if (!rws.available(8)) { + return Status::INCOMPLETE_LOG; + } + + Status s; + EP_( synced_seq_num_out = rws.getU64(s) ); + return Status(); +} + +Status MemTable::loadCheckpoint(RwSerializer& rws) +{ + if (!rws.available(8)) { + return Status::INCOMPLETE_LOG; + } + + Status s; + uint64_t chk_seqnum = 0; + EP_( chk_seqnum = rws.getU64(s) ); + + mGuard l(checkpointsLock); + auto entry = checkpoints.rbegin(); + if ( entry != checkpoints.rend() && + *entry == chk_seqnum ) { + // Same checkpoint already exists, ignore. 
        return Status();
    }
    checkpoints.push_back(chk_seqnum);
    return Status();
}

// Replay the on-disk log file into this memtable.
// Scans entry-by-entry (record / flush marker / checkpoint) until EOF,
// padding, or a parse error. Returns the scan status; on RECORD errors
// the error-handling macros (EB/EB_) set `s`/`m` and break the loop.
Status MemTable::load(RwSerializer& rws,
                      uint64_t min_seq,
                      uint64_t flushed_seq,
                      uint64_t synced_seq)
{
    Timer tt;
    getReady();

    // Seed counters from the manifest-provided expectations; they are
    // re-adjusted below from what is actually found in the file.
    flushedSeqNum = flushed_seq;
    minSeqNum = min_seq;
    startSeqNum = min_seq;
    seqNumAlloc = synced_seq;
    maxSeqNum = synced_seq;
    uint64_t padding_start_pos = NOT_INITIALIZED;

    Status s;
    size_t filesize = rws.size();
    size_t last_valid_size = rws.pos();
    uint64_t num_record = 0, num_flush = 0, num_chk = 0;
    uint64_t last_seq = NOT_INITIALIZED;
    std::string m;

    for (; rws.pos() < filesize;) {
        if (!rws.available(8)) {
            s = Status::INCOMPLETE_LOG;
            m = "not enough bytes for flags";
            break;
        }

        uint64_t flags;
        EB_( flags = rws.getU64(s), "failed to load flags" )
        FlagType type = identify_type(flags);
        if (type == FlagType::RECORD) {
            uint64_t seq = 0;
            EB( loadRecord(rws, flags, seq), "failed to load record" )
            last_seq = seq;
            num_record++;
            last_valid_size = rws.pos();

        } else if (type == FlagType::FLUSH_MARKER) {
            uint64_t marker_seq;
            EB( loadFlushMarker(rws, marker_seq), "failed to load flush marker" )
            num_flush++;
            last_valid_size = rws.pos();

        } else if (type == FlagType::CHECKPOINT) {
            EB( loadCheckpoint(rws), "failed to load checkpoint" )
            num_chk++;
            last_valid_size = rws.pos();

        } else if (type == FlagType::PADDING) {
            // The rest bytes are just for alignment
            m = "hit padding bytes";
            padding_start_pos = last_valid_size;
            break;
        } else {
            s = Status::UNKNOWN_LOG_FLAG;
            break;
        }
    }

    // Adopt the last successfully-loaded seq number as the new
    // max/alloc/synced point.
    if (NOT_INITIALIZED != last_seq) {
        maxSeqNum = last_seq;
        seqNumAlloc = last_seq;
        syncedSeqNum = last_seq;
    }

    // Three outcomes: missing entries (error), partial load with error
    // code (warning), or clean load (info). Same payload in each log.
    if (NOT_INITIALIZED != last_seq && last_seq < synced_seq) {
        _log_err( myLog,
                  "failed to load memTable for log file %s %ld "
                  "as some log entries are missing. "
                  "File size %zu, last read pos %zu, padding_start_pos %s. "
                  "Loaded last_seq %s, record %ld, flush marker %ld, chk %ld, "
                  "skiplist %zu. Expected min seq %s, synced_seq %s, "
                  "flushed_seq %s. %lu us elapsed, error code %s, message %s",
                  logFile->filename.c_str(), logFile->getLogFileNum(),
                  filesize, rws.pos(), _seq_str(padding_start_pos).c_str(),
                  _seq_str(last_seq).c_str(),
                  num_record,
                  num_flush,
                  num_chk,
                  skiplist_get_size(idxBySeq),
                  _seq_str(min_seq).c_str(),
                  _seq_str(synced_seq).c_str(),
                  _seq_str(flushed_seq).c_str(),
                  tt.getUs(),
                  s.toString().c_str(),
                  m.c_str() );
    } else if (!s) {
        _log_warn( myLog,
                   "MemTable for log file %s %ld loading complete with error. "
                   "File size %zu, last read pos %zu, padding_start_pos %s. "
                   "Loaded last_seq %s, record %ld, flush marker %ld, chk %ld, "
                   "skiplist %zu. Expected min seq %s, synced_seq %s, "
                   "flushed_seq %s. %lu us elapsed, error code %s, message %s",
                   logFile->filename.c_str(), logFile->getLogFileNum(),
                   filesize, rws.pos(), _seq_str(padding_start_pos).c_str(),
                   _seq_str(last_seq).c_str(),
                   num_record,
                   num_flush,
                   num_chk,
                   skiplist_get_size(idxBySeq),
                   _seq_str(min_seq).c_str(),
                   _seq_str(synced_seq).c_str(),
                   _seq_str(flushed_seq).c_str(),
                   tt.getUs(),
                   s.toString().c_str(),
                   m.c_str() );
    } else {
        _log_info( myLog,
                   "MemTable for log file %s %ld loading complete. "
                   "File size %zu, last read pos %zu, padding_start_pos %s. "
                   "Loaded last_seq %s, record %ld, flush marker %ld, chk %ld, "
                   "skiplist %zu. Expected min seq %s, synced_seq %s, "
                   "flushed_seq %s. %lu us elapsed, error code %s, message %s",
                   logFile->filename.c_str(), logFile->getLogFileNum(),
                   filesize, rws.pos(), _seq_str(padding_start_pos).c_str(),
                   _seq_str(last_seq).c_str(),
                   num_record,
                   num_flush,
                   num_chk,
                   skiplist_get_size(idxBySeq),
                   _seq_str(min_seq).c_str(),
                   _seq_str(synced_seq).c_str(),
                   _seq_str(flushed_seq).c_str(),
                   tt.getUs(),
                   s.toString().c_str(),
                   m.c_str() );
    }
    return s;
}

// Scan the file to find the byte offset just past the last entry whose
// seq number is <= target_seq, without materializing any records.
Status MemTable::findOffsetOfSeq(SimpleLogger* logger,
                                 RwSerializer& rws,
                                 uint64_t target_seq,
                                 uint64_t& offset_out,
                                 uint64_t* padding_start_pos_out)
{
    Timer tt;

    if (padding_start_pos_out != nullptr) {
        *padding_start_pos_out = NOT_INITIALIZED;
    }

    Status s;
    size_t filesize = rws.size();
    size_t last_valid_size = rws.pos();
    uint64_t last_offset = 0;
    uint64_t last_seq = 0;
    std::string m;

    for (; rws.pos() < filesize;) {
        if (!rws.available(8)) {
            s = Status::INCOMPLETE_LOG;
            m = "not enough bytes for flags";
            break;
        }

        uint64_t flags;
        EB_( flags = rws.getU64(s), "failed to load flags" )
        FlagType type = identify_type(flags);
        if (type == FlagType::RECORD) {
            // Just read length part and then skip.
            if (!rws.available(4 + REC_LEN_SIZE + 4)) {
                s = Status::INCOMPLETE_LOG;
                m = "not enough bytes for record";
                break;
            }
            uint32_t crc_len = 0;
            uint8_t len_buf[32];
            EP_( crc_len = rws.getU32(s) );
            EP( rws.get(len_buf, REC_LEN_SIZE) );

            // Validate the length-header CRC before trusting the sizes.
            uint32_t crc_len_chk = crc32_8(len_buf, REC_LEN_SIZE, 0);
            if (crc_len != crc_len_chk) {
                _log_err(logger, "crc error %x != %x", crc_len, crc_len_chk);
                std::stringstream ss;
                ss << "crc error " << crc_len << " != " << crc_len_chk;
                m = ss.str();
                s = Status::CHECKSUM_ERROR;
                break;
            }

            RwSerializer len_buf_rw(len_buf, 32);
            last_seq = len_buf_rw.getU64(s);

            uint32_t k_size = len_buf_rw.getU32(s);
            uint32_t m_size = len_buf_rw.getU32(s);
            uint32_t v_size = len_buf_rw.getU32(s);

            // Data CRC is read to advance the cursor, but not verified here.
            uint32_t crc_data = 0;
            TC_( crc_data = rws.getU32(s) );
            (void)crc_data;

            // Skip.
            if (!rws.available(k_size + m_size + v_size)) {
                s = Status::INCOMPLETE_LOG;
                m = "not enough bytes for content of record";
                break;
            }
            rws.pos( rws.pos() + k_size + m_size + v_size );

            // Seq number check.
            if (last_seq <= target_seq) last_offset = rws.pos();
            last_valid_size = rws.pos();

        } else if (type == FlagType::FLUSH_MARKER) {
            if (!rws.available(8)) {
                s = Status::INCOMPLETE_LOG;
                m = "not enough bytes for flush marker";
                break;
            }
            uint64_t dummy = rws.getU64(s);
            (void)dummy;
            last_valid_size = rws.pos();

        } else if (type == FlagType::CHECKPOINT) {
            if (!rws.available(8)) {
                s = Status::INCOMPLETE_LOG;
                m = "not enough bytes for checkpoint";
                break;
            }
            uint64_t dummy = rws.getU64(s);
            (void)dummy;
            last_valid_size = rws.pos();

        } else if (type == FlagType::PADDING) {
            // The rest bytes are just for alignment
            m = "hit padding bytes";
            if (padding_start_pos_out != nullptr) {
                *padding_start_pos_out = last_valid_size;
            }
            break;
        } else {
            s = Status::UNKNOWN_LOG_FLAG;
            break;
        }
    }

    if (s) {
        // Only return offset on success
        offset_out = last_offset;
        _log_info( logger,
                   "found offset %ld for seqnum %ld, loaded last_seq %s, "
                   "file_size %zu, last read pos %zu, "
                   "%lu us elapsed, error code %s, message %s",
                   last_offset, target_seq,
                   _seq_str(last_seq).c_str(),
                   filesize, rws.pos(),
                   tt.getUs(),
                   s.toString().c_str(),
                   m.c_str() );
    } else {
        _log_err( logger,
                  "failed to find offset for seqnum %ld, last_offset %ld, "
                  "loaded last_seq %s, file_size %zu, last read pos %zu, "
                  "%lu us elapsed, error code %s, message %s",
                  target_seq, last_offset,
                  _seq_str(last_seq).c_str(),
                  filesize, rws.pos(),
                  tt.getUs(),
                  s.toString().c_str(),
                  m.c_str() );
    }

    return s;
}

// MemTable flush: skiplist (memory) -> log file. (disk)
Status MemTable::flush(RwSerializer& rws)
{
    if (minSeqNum == NOT_INITIALIZED) {
        // No log in this file. Just do nothing and return OK.
        return Status();
    }

    // Write logs in a mutation order.
    // From `synced seq num` to `max seq num`
    uint64_t seqnum_upto = maxSeqNum;

    // Flush never happened: start from min seq num
    // Otherwise: start from last sync seq num + 1
    uint64_t ii = (syncedSeqNum.load() != NOT_INITIALIZED)
                  ? syncedSeqNum.load() + 1
                  : minSeqNum.load();

    Status s;
    uint64_t num_flushed = 0;
    uint64_t start_seqnum = NOT_INITIALIZED;
    Record query_rec;
    RecNodeSeq query(&query_rec);
    query.rec->seqNum = ii;
    skiplist_node* cursor = skiplist_find_greater_or_equal(idxBySeq, &query.snode);

    // Fixed per-record header size: see the record format comment below.
    const size_t REC_META_LEN =
        // Flags              CRC for length     seq number
        sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint64_t) +
        // KMV lengths           CRC for KMV
        sizeof(uint32_t) * 3 + sizeof(uint32_t);

    const size_t PAGE_LIMIT =
        DBMgr::getWithoutInit()->getGlobalConfig()->memTableFlushBufferSize;

    // Keep extra headroom.
    SizedBuf tmp_buf(PAGE_LIMIT * 2);
    SizedBuf::Holder h_tmp_buf(tmp_buf);
    RwSerializer ss_tmp_buf(tmp_buf);

    // NOTE: TC/TC_ macros throw a Status on failure; the catch below
    // releases the cursor before propagating.
    try {
        uint8_t len_buf[32];
        memset(len_buf, 0xff, 32);
        while (cursor) {
            RecNodeSeq* rec_node = _get_entry(cursor, RecNodeSeq, snode);
            Record* rec = rec_node->rec;
            if (!valid_number(start_seqnum)) start_seqnum = rec->seqNum;
            if (rec->seqNum > seqnum_upto) break;

            // << Record format >>
            // flags            8 bytes (first byte: 0x0)
            // CRC32 of lengths 4 bytes
            // seq number       8 bytes
            // key length (A)   4 bytes
            // meta length (B)  4 bytes
            // value length (C) 4 bytes
            // CRC32 of KMV     4 bytes
            // key              A
            // meta             B
            // value            C

            if (ss_tmp_buf.pos() + REC_META_LEN > PAGE_LIMIT) {
                // Flush and reset the position.
                TC( rws.put(tmp_buf.data, ss_tmp_buf.pos()) );
                ss_tmp_buf.pos(0);
            }

            TC( ss_tmp_buf.putU64(record_flags(rec)) );

            // Put seqnum + length info to `len_buf`.
            RwSerializer len_buf_s(len_buf, 32);
            len_buf_s.putU64(rec->seqNum);
            len_buf_s.putU32(rec->kv.key.size);
            len_buf_s.putU32(rec->meta.size);
            len_buf_s.putU32(rec->kv.value.size);
            uint64_t len_buf_size = len_buf_s.pos();

            // Calculate CRC of `len_buf`,
            // and put both CRC and `len_buf` to `tmp_buf`.
            uint32_t crc_len = crc32_8(len_buf, len_buf_size, 0);
            TC( ss_tmp_buf.putU32(crc_len) );
            TC( ss_tmp_buf.put(len_buf, len_buf_size) );

            // Calculate CRC of data, and put the CRC to `tmp_buf`.
            uint32_t crc_data = crc32_8(rec->kv.key.data, rec->kv.key.size, 0);
            crc_data = crc32_8(rec->meta.data, rec->meta.size, crc_data);
            crc_data = crc32_8(rec->kv.value.data, rec->kv.value.size, crc_data);
            TC( ss_tmp_buf.putU32(crc_data) );

            size_t data_len = rec->kv.key.size + rec->kv.value.size + rec->meta.size;
            if (data_len >= PAGE_LIMIT) {
                // Data itself is bigger than a page, flush and bypass buffer.
                TC( rws.put(tmp_buf.data, ss_tmp_buf.pos()) );
                ss_tmp_buf.pos(0);

                // Write to file directly.
                TC( rws.put(rec->kv.key.data, rec->kv.key.size) );
                TC( rws.put(rec->meta.data, rec->meta.size) );
                TC( rws.put(rec->kv.value.data, rec->kv.value.size) );

            } else {
                if (ss_tmp_buf.pos() + data_len > PAGE_LIMIT) {
                    // Data itself is not bigger than a page,
                    // but the entire data in the buffer will exceed
                    // the page size, once we append data.
                    // Flush the buffer into file.
                    TC( rws.put(tmp_buf.data, ss_tmp_buf.pos()) );
                    ss_tmp_buf.pos(0);
                }

                // Write to buffer.
                TC( ss_tmp_buf.put(rec->kv.key.data, rec->kv.key.size) );
                TC( ss_tmp_buf.put(rec->meta.data, rec->meta.size) );
                TC( ss_tmp_buf.put(rec->kv.value.data, rec->kv.value.size) );
            }

            num_flushed++;

            // Emit a checkpoint marker right after the record that a
            // checkpoint points at.
            ii = rec->seqNum;
            { mGuard l(checkpointsLock);
              for (auto& entry: checkpoints) {
                  uint64_t chk_seqnum = entry;
                  if (chk_seqnum == ii) {
                      TC( appendCheckpointMarker(ss_tmp_buf, chk_seqnum) );
                  }
              }
            }

            cursor = skiplist_next(idxBySeq, &rec_node->snode);
            skiplist_release_node(&rec_node->snode);
        }
        if (cursor) skiplist_release_node(cursor);

        { // In case that only a checkpoint is appended without any new record.
          mGuard l(checkpointsLock);
          auto entry = checkpoints.rbegin();
          if ( checkpointsDirty &&
               entry != checkpoints.rend() &&
               valid_number(syncedSeqNum) &&
               *entry == syncedSeqNum ) {
              TC( appendCheckpointMarker(rws, syncedSeqNum) );
          }
          checkpointsDirty = false;
        }

        // Just in case if remaining data exists.
        if (ss_tmp_buf.pos()) {
            TC( rws.put(tmp_buf.data, ss_tmp_buf.pos()) );
        }

        _log_debug(myLog,
                   "MemTable %ld flushed %ld skiplist %zu, "
                   "start_seqnum %ld seqnum_upto %ld",
                   logFile->getLogFileNum(), num_flushed,
                   skiplist_get_size(idxBySeq),
                   start_seqnum, seqnum_upto);

        syncedSeqNum = seqnum_upto;
        return Status();

    } catch (Status s) {
        if (cursor) skiplist_release_node(cursor);
        return s;
    }
}

Status MemTable::appendFlushMarker(RwSerializer& rws)
{
    // << Flush marker format >>
    // flags,             8 bytes (first byte: 0x1)
    // synced seq number, 8 bytes
    Status s;
    EP( rws.putU64(flush_marker_flags()) );
    EP( rws.putU64(syncedSeqNum) );
    return Status();
}

Status MemTable::appendCheckpointMarker(RwSerializer& rws,
                                        uint64_t chk_seqnum)
{
    // << Flush marker format >>
    // flags,                 8 bytes (first byte: 0x2)
    // checkpoint seq number, 8 bytes
    Status s;
    EP( rws.putU64(checkpoint_flags()) );
    EP( rws.putU64(chk_seqnum) );
    return Status();
}

Status MemTable::checkpoint(uint64_t&
seq_num_out) { + // No log. Do nothing. + if (!valid_number(maxSeqNum)) return Status(); + + mGuard l(checkpointsLock); + seq_num_out = maxSeqNum; + + auto last_entry = checkpoints.rbegin(); + if ( last_entry != checkpoints.rend() && + *last_entry == maxSeqNum) { + // Same one already exists, ignore. + return Status(); + } + checkpoints.push_back(seq_num_out); + checkpointsDirty = true; + return Status(); +} + +Status MemTable::getLogsToFlush(const uint64_t seq_num, + std::list& list_out, + bool ignore_sync_seqnum) +{ + if ( !ignore_sync_seqnum && + syncedSeqNum == NOT_INITIALIZED ) { + return Status::LOG_NOT_SYNCED; + } + if ( flushedSeqNum != NOT_INITIALIZED && + seq_num <= flushedSeqNum ) { + return Status::ALREADY_PURGED; + } + + uint64_t ii = minSeqNum; + if (flushedSeqNum != NOT_INITIALIZED) ii = flushedSeqNum + 1; + + // Remove previous records if there is a seq number rollback. + auto entry = list_out.rbegin(); + while (entry != list_out.rend()) { + Record* cur_rec = *entry; + if (cur_rec->seqNum >= ii) { + _log_err(myLog, "found duplicate seq number across different log files: " + "%zu. will use newer one", + cur_rec->seqNum); + list_out.pop_back(); + entry = list_out.rbegin(); + } else { + break; + } + } + + Record query_rec; + RecNodeSeq query(&query_rec); + + // Check last put seq number for debugging purpose. 
+ uint64_t last_seq = 0; + query.rec->seqNum = ii; + skiplist_node* cursor = skiplist_find_greater_or_equal(idxBySeq, &query.snode); + while (cursor) { + RecNodeSeq* rec_node = _get_entry(cursor, RecNodeSeq, snode); + Record* rec = rec_node->rec; + if (rec->seqNum > seq_num) break; + + if (rec->seqNum == last_seq) { + _log_err(myLog, "found duplicate seq number %zu", last_seq); + assert(0); + } + list_out.push_back(rec); + last_seq = rec->seqNum; + + cursor = skiplist_next(idxBySeq, &rec_node->snode); + skiplist_release_node(&rec_node->snode); + } + if (cursor) skiplist_release_node(cursor); + + return Status(); +} + +Status MemTable::getCheckpoints(const uint64_t seq_num, + std::list& list_out) +{ + mGuard l(checkpointsLock); + for (auto& entry: checkpoints) { + uint64_t chk_seqnum = entry; + if ( valid_number(flushedSeqNum) && + chk_seqnum <= flushedSeqNum ) continue; + if ( chk_seqnum <= seq_num || + !valid_number(seq_num) ) { + list_out.push_back(chk_seqnum); + } + } + return Status(); +} + +size_t MemTable::getNumLogs() const { + return skiplist_get_size(idxBySeq); +} + +Status MemTable::forceSeqnum(uint64_t to) { + if ( valid_number(minSeqNum) && + minSeqNum > to ) { + return Status(); + } + + if ( !valid_number(flushedSeqNum) || + !valid_number(syncedSeqNum) || + to > flushedSeqNum || + to > syncedSeqNum ) { + seqNumAlloc = to; + maxSeqNum = to; + syncedSeqNum = to; + flushedSeqNum = to; + if (!valid_number(minSeqNum)) minSeqNum = to; + _log_warn(myLog, "updated flushed/sync seq numbers to %zu", to); + } + return Status(); +} + +} // namespace jungle + diff --git a/src/memtable.h b/src/memtable.h new file mode 100644 index 0000000..896c73c --- /dev/null +++ b/src/memtable.h @@ -0,0 +1,236 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "fileops_base.h" +#include "internal_helper.h" +#include "skiplist.h" + +#include + +#include +#include +#include + +class BloomFilter; +class SimpleLogger; + +namespace jungle { + +class LogFile; +class MemTable { + friend class LogFile; +private: + struct RecNode; + +public: + MemTable(const LogFile* log_file); + ~MemTable(); + + static int cmpKey(skiplist_node *a, skiplist_node *b, void *aux); + + Status init(const uint64_t start_seq_num = 0); + + Status assignSeqNum(Record& rec_local); + Status updateMaxSeqNum(const uint64_t seq_num); + RecNode* findRecNode(Record* rec); + void addToByKeyIndex(Record* rec); + Status addToBySeqIndex(Record* rec, Record*& prev_rec_out); + Status putNewRecord(const Record& rec); + // Returns pointer only. + Status findRecordBySeq(const uint64_t seq_num, + Record& rec_out); + // Returns pointer only. 
+ Status getRecordByKey(const uint64_t chk, + const SizedBuf& key, + uint64_t* key_hash, + Record& rec_out, + bool allow_tombstone = false); + Status sync(FileOps* f_ops, + FileHandle* fh); + Status appendFlushMarker(RwSerializer& rws); + Status appendCheckpointMarker(RwSerializer& rws, + uint64_t chk_seqnum); + Status loadRecord(RwSerializer& rws, + uint64_t flags, + uint64_t& seqnum_out); + Status loadFlushMarker(RwSerializer& rws, + uint64_t& synced_seq_num_out); + Status loadCheckpoint(RwSerializer& rws); + + Status load(RwSerializer& rws, + uint64_t min_seq, + uint64_t flushed_seq, + uint64_t synced_seq); + + static Status findOffsetOfSeq(SimpleLogger* logger, + RwSerializer& rws, + uint64_t target_seq, + uint64_t& offset_out, + uint64_t* padding_start_pos_out = nullptr); + + Status flush(RwSerializer& rws); + Status checkpoint(uint64_t& seq_num_out); + Status getLogsToFlush(const uint64_t seq_num, + std::list& list_out, + bool ignore_sync_seqnum); + // If seq_num == NOT_INITIALIZED, return all. + Status getCheckpoints(const uint64_t seq_num, + std::list& list_out); + + size_t getNumLogs() const; + + // Next seqnum will start from `to + 1`. 
+ uint64_t getSeqCounter() const { return seqNumAlloc.load(MOR); } + Status forceSeqnum(uint64_t to); + + void setLogger(SimpleLogger* logger) { myLog = logger; } + + bool isIncreasingOrder() const { return increasingOrder; } + + // Size of memory table in bytes + uint64_t size() const { return bytesSize.load(); } + + class Iterator { + public: + Iterator(); + ~Iterator(); + + enum SeekOption { + GREATER = 0, + SMALLER = 1, + }; + + Status init(const MemTable* m_table, + const SizedBuf& start_key, + const SizedBuf& end_key, + const uint64_t seq_upto); + Status initSN(const MemTable* m_table, + const uint64_t min_seq, + const uint64_t max_seq); + Status get(Record& rec_out); + Status prev(bool allow_tombstone = false); + Status next(bool allow_tombstone = false); + Status seek(const SizedBuf& key, SeekOption opt = GREATER); + Status seekSN(const uint64_t seqnum, SeekOption opt = GREATER); + Status gotoBegin(); + Status gotoEnd(); + Status close(); + + private: + enum Type { + BY_KEY = 0, + BY_SEQ = 1 + } type; + const MemTable* mTable; + skiplist_node* cursor; + uint64_t minSeq; + uint64_t maxSeq; + SizedBuf startKey; + SizedBuf endKey; + uint64_t seqUpto; + }; + +private: +// === TYPES + static const int INIT_ARRAY_SIZE = 16384; + static const int MAX_LOGFILE_SIZE = 4*1024*1024; + + struct RecNode { + using RecList = std::list; + + RecNode(const SizedBuf& _key); + RecNode(const SizedBuf* _key); + ~RecNode(); + + static int cmp(skiplist_node *a, skiplist_node *b, void *aux); + Record* getLatestRecord(const uint64_t chk); + uint64_t getMinSeq(); + bool validKeyExist(const uint64_t chk, + bool allow_tombstone = false); + + skiplist_node snode; + SizedBuf key; + bool freeKey; + std::mutex recListLock; + RecList* recList; + }; + + struct RecNodeSeq { + RecNodeSeq(Record* _rec); + ~RecNodeSeq(); + + static int cmp(skiplist_node *a, skiplist_node *b, void *aux); + + skiplist_node snode; + // TODO: multi-thread update? 
+ Record* rec; + }; + +// === FUNCTIONS + Status getReady(); + +// === VARIABLES + uint64_t startSeqNum; + + // Earliest log seq number. + std::atomic minSeqNum; + // Last flushed (moved to index table) seq number. + std::atomic flushedSeqNum; + // Last synced (flushed from Mem table into corresponding log file, + // but not physically durable yet) seq number. If logs are flushed + // and then synced at once, this value always will be the same as + // `fsyncedSeqNum`. + std::atomic syncedSeqNum; + // Latest log seq number in the current mem table. + std::atomic maxSeqNum; + // Atomic counter for seq number allocation. + std::atomic seqNumAlloc; + // Atomic counter for byte size of whole memtable + std::atomic bytesSize; + + // Index by sequence number. + skiplist_raw* idxBySeq; + + // Index by key. + skiplist_raw* idxByKey; + // Bloom filter for key. + BloomFilter* bfByKey; + + // Temporary store for stale (overwritten using the same seq num) logs. + std::mutex staleLogsLock; + std::list staleLogs; + + // Checkpoints. + std::mutex checkpointsLock; + std::list checkpoints; + bool checkpointsDirty; + + // Parent log file. + const LogFile* logFile; + + // Pointer to last inserted record. + Record* lastInsertedRec; + + // `true` if all records have been inserted in key order. + bool increasingOrder; + + SimpleLogger* myLog; +}; + +} // namespace jungle + diff --git a/src/memtable_iterator.cc b/src/memtable_iterator.cc new file mode 100644 index 0000000..b83eb81 --- /dev/null +++ b/src/memtable_iterator.cc @@ -0,0 +1,450 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "memtable.h" + +namespace jungle { + +MemTable::Iterator::Iterator() + : mTable(nullptr) + , cursor(nullptr) + , minSeq(NOT_INITIALIZED) + , maxSeq(NOT_INITIALIZED) + , seqUpto(NOT_INITIALIZED) + {} + +MemTable::Iterator::~Iterator() { + close(); +} + +Status MemTable::Iterator::init(const MemTable* m_table, + const SizedBuf& start_key, + const SizedBuf& end_key, + const uint64_t seq_upto) +{ + mTable = m_table; + type = BY_KEY; + + RecNode query(&start_key); + cursor = skiplist_find_greater_or_equal(mTable->idxByKey, &query.snode); + if (!cursor) return Status::ITERATOR_INIT_FAIL; + + RecNode* rn = _get_entry(cursor, RecNode, snode); + if (!end_key.empty() && SizedBuf::cmp(rn->key, end_key) > 0) { + skiplist_release_node(cursor); + cursor = nullptr; + return Status::OUT_OF_RANGE; + } + + startKey.alloc(start_key); + endKey.alloc(end_key); + seqUpto = seq_upto; + + // Skip invalid keys. + for (; cursor ;) { + RecNode* rn = _get_entry(cursor, RecNode, snode); + if (!endKey.empty() && SizedBuf::cmp(rn->key, endKey) > 0) { + // Greater than end key. + skiplist_release_node(cursor); + cursor = nullptr; + return Status::OUT_OF_RANGE; + } + + // Allow tombstone. + if ( !rn->validKeyExist(seqUpto, true) ) { + // No record belongs to the current snapshot (iterator), + // move further. 
+ cursor = skiplist_next(mTable->idxByKey, cursor); + skiplist_release_node(&rn->snode); + continue; + } + break; + } + + return Status(); +} + +Status MemTable::Iterator::initSN(const MemTable* m_table, + const uint64_t min_seq, + const uint64_t max_seq) +{ + mTable = m_table; + type = BY_SEQ; + + if (mTable->maxSeqNum == NOT_INITIALIZED) + return Status::NOT_INITIALIZED; + + // Doesn't allow to access logs already purged. + uint64_t min_local = min_seq; + if (mTable->minSeqNum != NOT_INITIALIZED && + min_local < mTable->minSeqNum) + min_local = mTable->minSeqNum; + + if (mTable->flushedSeqNum != NOT_INITIALIZED && + min_local < mTable->flushedSeqNum) + min_local = mTable->flushedSeqNum; + + if (min_local > mTable->maxSeqNum) + min_local = mTable->maxSeqNum; + + minSeq = min_local; + + // Need to filter out iterator request beyond the range. + if (minSeq > max_seq) return Status::OUT_OF_RANGE; + + Record query_rec; + RecNodeSeq query(&query_rec); + query.rec->seqNum = minSeq; + cursor = skiplist_find_greater_or_equal(mTable->idxBySeq, &query.snode); + if (!cursor) return Status::ITERATOR_INIT_FAIL; + + // Doesn't allow to access logs beyond max seqnum. 
+ uint64_t max_local = max_seq; + if (mTable->minSeqNum != NOT_INITIALIZED && + max_local < mTable->minSeqNum) + max_local = mTable->minSeqNum; + + if (max_local > mTable->maxSeqNum) + max_local = mTable->maxSeqNum; + + maxSeq = max_local; + + return Status(); +} + +Status MemTable::Iterator::get(Record& rec_out) { + if (!mTable) return Status::NOT_INITIALIZED; + if (!cursor) return Status::KEY_NOT_FOUND; + if (type == BY_KEY) { + RecNode* node = _get_entry(cursor, RecNode, snode); + Record* rec = node->getLatestRecord(seqUpto); + assert(rec); + rec_out = *rec; + + } else { // BY_SEQ + RecNodeSeq* node = _get_entry(cursor, RecNodeSeq, snode); + Record* rec = node->rec; + assert(rec); + rec_out = *rec; + } + + return Status(); +} + +Status MemTable::Iterator::prev(bool allow_tombstone) { + if (!mTable) return Status::NOT_INITIALIZED; + if (!cursor) return Status::KEY_NOT_FOUND; + if (type == BY_KEY) { + skiplist_node* node_to_release = cursor; + skiplist_node* prev_node = skiplist_prev(mTable->idxByKey, cursor); + + for (;;) { + if (!prev_node) return Status::OUT_OF_RANGE; + + RecNode* rn = _get_entry(prev_node, RecNode, snode); + if (!startKey.empty() && SizedBuf::cmp(rn->key, startKey) < 0) { + // Smaller than start key. 
+ skiplist_release_node(prev_node); + return Status::OUT_OF_RANGE; + } + + if ( !rn->validKeyExist(seqUpto, allow_tombstone) ) { + prev_node = skiplist_prev(mTable->idxByKey, prev_node); + skiplist_release_node(&rn->snode); + continue; + } + break; + } + + cursor = prev_node; + skiplist_release_node(node_to_release); + + } else { // BY_SEQ + skiplist_node* node_to_release = cursor; + skiplist_node* prev_node = skiplist_prev(mTable->idxBySeq, cursor); + if (!prev_node) return Status::OUT_OF_RANGE; + + RecNodeSeq* rn = _get_entry(prev_node, RecNodeSeq, snode); + if (rn->rec->seqNum < minSeq) { + skiplist_release_node(prev_node); + return Status::OUT_OF_RANGE; + } + + cursor = prev_node; + skiplist_release_node(node_to_release); + } + + return Status(); +} + +Status MemTable::Iterator::next(bool allow_tombstone) { + if (!mTable) return Status::NOT_INITIALIZED; + if (!cursor) return Status::KEY_NOT_FOUND; + if (type == BY_KEY) { + skiplist_node* node_to_release = cursor; + skiplist_node* next_node = skiplist_next(mTable->idxByKey, cursor); + + for(;;) { + if (!next_node) return Status::OUT_OF_RANGE; + + RecNode* rn = _get_entry(next_node, RecNode, snode); + if (!endKey.empty() && SizedBuf::cmp(rn->key, endKey) > 0) { + // Greater than end key. + skiplist_release_node(next_node); + return Status::OUT_OF_RANGE; + } + + if ( !rn->validKeyExist(seqUpto, allow_tombstone) ) { + // No record belongs to the current snapshot (iterator), + // move further. 
+ next_node = skiplist_next(mTable->idxByKey, next_node); + skiplist_release_node(&rn->snode); + continue; + } + break; + } + + cursor = next_node; + skiplist_release_node(node_to_release); + + } else { // BY_SEQ + skiplist_node* node_to_release = cursor; + skiplist_node* next_node = skiplist_next(mTable->idxBySeq, cursor); + if (!next_node) return Status::OUT_OF_RANGE; + + RecNodeSeq* rn = _get_entry(next_node, RecNodeSeq, snode); + if (rn->rec->seqNum > maxSeq) { + skiplist_release_node(next_node); + return Status::OUT_OF_RANGE; + } + + cursor = next_node; + skiplist_release_node(node_to_release); + } + + return Status(); +} + +Status MemTable::Iterator::seek + ( const SizedBuf& key, SeekOption opt ) +{ + if (type != BY_KEY) return Status::INVALID_HANDLE_USAGE; + if (!cursor) return Status::KEY_NOT_FOUND; + + skiplist_node* node_to_release = cursor; + skiplist_node* seek_node = nullptr; + + const SizedBuf* query_key_ptr = &key; + if (opt == GREATER && key.empty() && !startKey.empty()) { + // gotoBegin, and start key is given. + query_key_ptr = &startKey; + } + RecNode query(query_key_ptr); + + bool fwd_search = true; + + if (opt == GREATER) { + fwd_search = true; // Foward search. + seek_node = skiplist_find_greater_or_equal + ( mTable->idxByKey, &query.snode ); + if (!seek_node) { + if (endKey.empty()) { + // End key is not given. + seek_node = skiplist_end(mTable->idxByKey); + } else { + // End key is given. + RecNode end_query(&endKey); + seek_node = skiplist_find_smaller_or_equal + ( mTable->idxByKey, &end_query.snode ); + } + // If not found, backward search from the end. + fwd_search = false; + } + + } else if (opt == SMALLER) { + fwd_search = false; + seek_node = skiplist_find_smaller_or_equal + ( mTable->idxByKey, &query.snode ); + if (!seek_node) { + if (startKey.empty()) { + // Start key is not given. + seek_node = skiplist_begin(mTable->idxByKey); + } else { + // End key is given. 
+ RecNode start_query(&startKey); + seek_node = skiplist_find_greater_or_equal + ( mTable->idxByKey, &start_query.snode ); + } + // If not found, forward search from the beginning. + fwd_search = true; + } + + } else { + assert(0); + } + + if (fwd_search) { + // Go forward if no record belongs to the snapshot. + for (; seek_node ;) { + RecNode* rn = _get_entry(seek_node, RecNode, snode); + if (!endKey.empty() && SizedBuf::cmp(rn->key, endKey) > 0) { + // Greater than end key. + skiplist_release_node(seek_node); + return Status::OUT_OF_RANGE; + } + + if ( !rn->validKeyExist(seqUpto, true) ) { + // No record belongs to the current snapshot (iterator), + // move further. + seek_node = skiplist_next(mTable->idxByKey, seek_node); + skiplist_release_node(&rn->snode); + continue; + } + break; + } + + } else { + // Go backward if no record belongs to the snapshot. + for (; seek_node ;) { + RecNode* rn = _get_entry(seek_node, RecNode, snode); + if (!startKey.empty() && SizedBuf::cmp(rn->key, startKey) < 0) { + // Smaller than start key. + skiplist_release_node(seek_node); + return Status::OUT_OF_RANGE; + } + + if ( !rn->validKeyExist(seqUpto, true) ) { + // No record belongs to the current snapshot (iterator), + // move further. 
+ seek_node = skiplist_prev(mTable->idxByKey, seek_node); + skiplist_release_node(&rn->snode); + continue; + } + break; + } + } + + if (!seek_node) return Status::OUT_OF_RANGE; + + cursor = seek_node; + skiplist_release_node(node_to_release); + + return Status(); +} + +Status MemTable::Iterator::seekSN + ( const uint64_t seqnum, MemTable::Iterator::SeekOption opt ) +{ + if (type != BY_SEQ) return Status::INVALID_HANDLE_USAGE; + if (!cursor) return Status::KEY_NOT_FOUND; + + skiplist_node* node_to_release = cursor; + skiplist_node* seek_node = nullptr; + + Record rec; + RecNodeSeq query(&rec); + query.rec->seqNum = seqnum; + + if (opt == GREATER) { + if (seqnum < minSeq) query.rec->seqNum = minSeq; + + seek_node = skiplist_find_greater_or_equal + ( mTable->idxBySeq, &query.snode ); + if (!seek_node) { + query.rec->seqNum = maxSeq; + seek_node = skiplist_find_smaller_or_equal + ( mTable->idxBySeq, &query.snode ); + } + } else if (opt == SMALLER) { + if (seqnum < maxSeq) query.rec->seqNum = maxSeq; + + seek_node = skiplist_find_smaller_or_equal + ( mTable->idxBySeq, &query.snode ); + if (!seek_node) { + query.rec->seqNum = minSeq; + seek_node = skiplist_find_greater_or_equal + ( mTable->idxBySeq, &query.snode ); + } + } else { + assert(0); + } + + if (!seek_node) return Status::OUT_OF_RANGE; + + cursor = seek_node; + skiplist_release_node(node_to_release); + + return Status(); +} + +Status MemTable::Iterator::gotoBegin() { + if (!mTable) return Status::NOT_INITIALIZED; + if (!cursor) return Status::KEY_NOT_FOUND; + if (type == BY_KEY) { + skiplist_node* node_to_release = cursor; + skiplist_node* min_node = nullptr; + if (startKey.empty()) { + min_node = skiplist_begin(mTable->idxByKey); + } else { + RecNode query(&startKey); + min_node = skiplist_find_greater_or_equal + ( mTable->idxByKey, &query.snode ); + } + if (!min_node) return Status::OUT_OF_RANGE; + + cursor = min_node; + skiplist_release_node(node_to_release); + + } else { // BY_SEQ + return seekSN(minSeq, 
GREATER); + } + return Status(); +} + +Status MemTable::Iterator::gotoEnd() { + if (!mTable) return Status::NOT_INITIALIZED; + if (type == BY_KEY) { + skiplist_node* node_to_release = cursor; + skiplist_node* max_node = nullptr; + if (endKey.empty()) { + max_node = skiplist_end(mTable->idxByKey); + } else { + RecNode query(&endKey); + max_node = skiplist_find_smaller_or_equal + ( mTable->idxByKey, &query.snode ); + } + if (!max_node) return Status::OUT_OF_RANGE; + + cursor = max_node; + skiplist_release_node(node_to_release); + + } else { // BY_SEQ + return seekSN(maxSeq, SMALLER); + } + return Status(); +} + +Status MemTable::Iterator::close() { + if (cursor) skiplist_release_node(cursor); + // Nothing to free, just clear them. + mTable = nullptr; + cursor = nullptr; + startKey.free(); + endKey.free(); + return Status(); +} + +}; // namespace jungle diff --git a/src/murmurhash3.cc b/src/murmurhash3.cc new file mode 100644 index 0000000..a8bc139 --- /dev/null +++ b/src/murmurhash3.cc @@ -0,0 +1,358 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2016 Austin Appleby +See URL: https://github.com/aappleby/smhasher/tree/master/src + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "murmurhash3.h" + +// LCOV_EXCL_START + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 
16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i*4+0); + uint32_t k2 = 
getblock32(blocks,i*4+1); + uint32_t k3 = getblock32(blocks,i*4+2); + uint32_t k4 = getblock32(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int 
len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + +// LCOV_EXCL_STOP + diff --git a/src/murmurhash3.h 
b/src/murmurhash3.h new file mode 100644 index 0000000..ecda26d --- /dev/null +++ b/src/murmurhash3.h @@ -0,0 +1,58 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2016 Austin Appleby +See URL: https://github.com/aappleby/smhasher/tree/master/src + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
+ +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) && (_MSC_VER < 1600) + +/* +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned __int64 uint64_t; +*/ + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/src/simple_thread_pool.h b/src/simple_thread_pool.h new file mode 100644 index 0000000..f5a3f9c --- /dev/null +++ b/src/simple_thread_pool.h @@ -0,0 +1,634 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2019 Jung-Sang Ahn +See URL: https://github.com/greensky00/simple_thread_pool + (v0.1.2) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace simple_thread_pool { + +class EventAwaiter { +private: + enum class AS { + idle = 0x0, + ready = 0x1, + waiting = 0x2, + done = 0x3 + }; + +public: + EventAwaiter() : status(AS::idle) {} + + void reset() { + status.store(AS::idle); + } + + void wait() { + wait_us(0); + } + + void wait_ms(size_t time_ms) { + wait_us(time_ms * 1000); + } + + void wait_us(size_t time_us) { + AS expected = AS::idle; + if (status.compare_exchange_strong(expected, AS::ready)) { + // invoke() has not been invoked yet, wait for it. + std::unique_lock l(cvLock); + expected = AS::ready; + if (status.compare_exchange_strong(expected, AS::waiting)) { + if (time_us) { + cv.wait_for(l, std::chrono::microseconds(time_us)); + } else { + cv.wait(l); + } + status.store(AS::done); + } else { + // invoke() has grabbed `cvLock` earlier than this. + } + } else { + // invoke() already has been called earlier than this. + } + } + + void invoke() { + AS expected = AS::idle; + if (status.compare_exchange_strong(expected, AS::done)) { + // wait() has not been invoked yet, do nothing. + return; + } + + std::unique_lock l(cvLock); + expected = AS::ready; + if (status.compare_exchange_strong(expected, AS::done)) { + // wait() has been called earlier than invoke(), + // but invoke() has grabbed `cvLock` earlier than wait(). + // Do nothing. + } else { + // wait() is waiting for ack. 
+ cv.notify_all(); + } + } + +private: + std::atomic status; + std::mutex cvLock; + std::condition_variable cv; +}; + +enum TaskType { + ONE_TIME = 0x0, + RECURRING = 0x1, +}; + +enum TaskStatus { + WAITING = 0x0, + RUNNING = 0x1, + DONE = 0x2, +}; + +class TaskResult { +public: + enum Value { + OK = 0, + CANCELED = -1, + FAILED = -32768, + }; + + TaskResult() : val(OK) {} + TaskResult(Value v) : val(v) {} + TaskResult(int v) : val((Value)v) {} + inline explicit operator bool() { return ok(); } + inline operator int() const { return (int)val; } + inline bool ok() const { return (val == OK); } + inline Value value() const { return val; } + +private: + Value val; +}; + +using TaskHandler = std::function< void(const TaskResult&) >; + +class ThreadHandle; +class ThreadPoolMgrBase { + friend class TaskHandle; + friend class ThreadHandle; + +public: + ThreadPoolMgrBase() {} + + virtual bool isStopped() const = 0; + + virtual void invoke() = 0; + + virtual void returnThread(const std::shared_ptr& t_handle) = 0; + + virtual bool invokeCanceledTask() const = 0; +}; + +class TaskHandle { +public: + TaskHandle(ThreadPoolMgrBase* m, + const TaskHandler& h, + uint64_t interval_us, + TaskType tt) + : type(tt) + , mgr(m) + , intervalUs(interval_us) + , handler(h) + , status(WAITING) + { + reschedule(intervalUs); + } + + /** + * Check if this task is eligible to execute now. + * + * @param[out] time_left_us_out + * `0` if we can execute this task right now. + * Non-zero if we should wait more. + * @return `true` if we can execute this task. + */ + bool timeToFire(uint64_t& time_left_us_out) { + auto cur = std::chrono::system_clock::now(); + + std::lock_guard l(tRegisteredLock); + std::chrono::duration elapsed = cur - tRegistered; + if (intervalUs < elapsed.count() * 1000000) { + time_left_us_out = 0; + return true; + } + time_left_us_out = intervalUs - ( elapsed.count() * 1000000 ); + return false; + } + + /** + * Execute this task. 
+ * If the task already has been executed, this function will + * do nothing. + * + * @param ret Result value that will be passed to the callback function. + * @return `true` if successfully executed. + */ + bool execute(const TaskResult& ret) { + TaskStatus exp = WAITING; + TaskStatus desired = RUNNING; + if (status.compare_exchange_strong(exp, desired)) { + handler(ret); + if (type == ONE_TIME) { + status = DONE; + } else { + exp = RUNNING; + desired = WAITING; + status.compare_exchange_strong(exp, desired); + } + return true; + } + return false; + } + + /** + * Cancel this task. + * + * @return `true` if successfully canceled. + * `false` if the task already has been executed. + */ + bool cancel() { + if (type == ONE_TIME) { + TaskStatus exp = WAITING; + TaskStatus desired = DONE; + if (status.compare_exchange_strong(exp, desired)) { + if (mgr->invokeCanceledTask()) { + handler( TaskResult(TaskResult::CANCELED) ); + } + return true; + } + return false; + } + status = DONE; + return true; + } + + /** + * Reschedule this task. + * + * @param new_interval_us New interval in microseconds. + * If negetive value, will use existing inteval. + */ + void reschedule(int64_t new_interval_us = -1) { + auto cur = std::chrono::system_clock::now(); + { std::lock_guard l(tRegisteredLock); + tRegistered = cur; + if (new_interval_us >= 0) { + intervalUs = new_interval_us; + } + } + mgr->invoke(); + } + + /** + * Check if this task is one-time job. + * + * @return `true` if one-time job. + */ + bool isOneTime() const { return (type == ONE_TIME); } + + /** + * Check if this task has been executed. + * + * @return `true` if already executed. + */ + bool isDone() const { return (status == DONE); } + +private: + // Type of this task (one time or recurring). + TaskType type; + + // Parent manager. + ThreadPoolMgrBase* mgr; + + // Time when this task is initiated. + std::chrono::time_point tRegistered; + std::mutex tRegisteredLock; + + // Interval if this is timer task. 
+ uint64_t intervalUs; + + // Callback function. + TaskHandler handler; + + // Current running status. + std::atomic status; +}; + +class ThreadHandle { +public: + ThreadHandle(ThreadPoolMgrBase* m, + size_t id) + : myId(id) + , mgr(m) + , myself(nullptr) + , tHandle(nullptr) + , assignedTask(nullptr) + { + (void)myId; + } + + + void init(const std::shared_ptr< ThreadHandle >& itself) { + myself = itself; + tHandle = std::shared_ptr + ( new std::thread(&ThreadHandle::loop, this) ); + } + + void shutdown() { + eaLoop.invoke(); + if (tHandle && tHandle->joinable()) { + tHandle->join(); + } + myself.reset(); + tHandle.reset(); + assignedTask.reset(); + } + + void loop() { +#ifdef __linux__ + std::string thread_name = "stp_" + std::to_string(myId); + pthread_setname_np(pthread_self(), thread_name.c_str()); +#endif + while (!mgr->isStopped()) { + eaLoop.wait(); + eaLoop.reset(); + if (mgr->isStopped()) break; + + if (assignedTask) { + assignedTask->execute(TaskResult()); + } + assignedTask.reset(); + + mgr->returnThread(myself); + mgr->invoke(); + } + } + + void assign(const std::shared_ptr& handle) { + assignedTask = handle; + eaLoop.invoke(); + } + +private: + // Thread ID. + size_t myId; + + // Parent manager. + ThreadPoolMgrBase* mgr; + + // Instance of itself. + std::shared_ptr< ThreadHandle > myself; + + // Thread. + std::shared_ptr< std::thread > tHandle; + + // Assigned task to execute. + std::shared_ptr< TaskHandle > assignedTask; + + // Condition variable for thread loop. + EventAwaiter eaLoop; +}; + +struct ThreadPoolOptions { + ThreadPoolOptions() + : numInitialThreads(4) + , busyWaitingIntervalUs(100) + , invokeCanceledTask(false) + {} + + // Number of threads in the pool. + // If 0, the main coordinator loop will execute the task, + // which may block other pending request. + size_t numInitialThreads; + + // Time interval to do busy waiting instead of sleeping. + // Higher number will provide better accuracy, but + // will consume more CPU. 
+ size_t busyWaitingIntervalUs; + + // If `true`, will invoke task handler with `CANCELED` result code. + bool invokeCanceledTask; +}; + +class ThreadPoolMgr : public ThreadPoolMgrBase { + friend class TaskHandle; + friend class ThreadHandle; + +public: + ThreadPoolMgr() + : ThreadPoolMgrBase() + , stopSignal(false) + , loopThread(nullptr) + {} + + ~ThreadPoolMgr() { + shutdown(); + } + + /** + * Initialize the thread pool with given options. + * + * @param opt Options. + */ + void init(const ThreadPoolOptions& opt) { + myOpt = opt; + stopSignal = false; + + loopThread = std::shared_ptr + ( new std::thread( &ThreadPoolMgr::loop, this ) ); + + { std::lock_guard l(idleThreadsLock); + for (size_t ii = 0; ii < myOpt.numInitialThreads; ++ii) { + std::shared_ptr t_handle( new ThreadHandle(this, ii) ); + idleThreads.push_back(t_handle); + t_handle->init(t_handle); + } + } + } + + /** + * Step down the thread pool. + */ + void shutdown() { + stopSignal = true; + if (loopThread && loopThread->joinable()) { + eaLoop.invoke(); + loopThread->join(); + } + loopThread.reset(); + + do { + std::shared_ptr t_handle_to_free = nullptr; + { std::lock_guard l(idleThreadsLock); + auto entry = idleThreads.begin(); + if (entry == idleThreads.end()) break; + t_handle_to_free = *entry; + idleThreads.erase(entry); + } + t_handle_to_free->shutdown(); + } while (true); + + std::list< std::shared_ptr > tasks_to_cancel; + { std::lock_guard l(timedTasksLock); + for (auto& entry: timedTasks) tasks_to_cancel.push_back(entry); + timedTasks.clear(); + } + { std::lock_guard l(normalTasksLock); + for (auto& entry: normalTasks) tasks_to_cancel.push_back(entry); + normalTasks.clear(); + } + TaskResult tr(TaskResult::CANCELED); + for (auto& entry: tasks_to_cancel) { + std::shared_ptr& tt = entry; + tt->cancel(); + } + } + + /** + * Register an async task. + * + * @param handler Callback function to execute. + * @param interval_us Interval in microseconds, if given task + * is a timer work. 
+ * @param type ONE_TIME or RECURRING. + * @return Handle of registered task. + */ + std::shared_ptr addTask(const TaskHandler& handler, + uint64_t interval_us = 0, + TaskType type = TaskType::ONE_TIME) + { + std::shared_ptr + new_task( new TaskHandle(this, handler, interval_us, type) ); + + if (interval_us) { + std::lock_guard l(timedTasksLock); + timedTasks.push_back(new_task); + } else { + std::lock_guard l(normalTasksLock); + normalTasks.push_back(new_task); + } + eaLoop.invoke(); + + return new_task; + } + + /** + * Check if thread pool manager is stopped. + * + * @return `true` if stopped. + */ + bool isStopped() const { return stopSignal; } + + /** + * Manually invoke the main coordination loop. + */ + void invoke() { eaLoop.invoke(); } + +private: + void returnThread(const std::shared_ptr& t_handle) { + std::lock_guard l(idleThreadsLock); + idleThreads.push_front( t_handle ); + } + + bool invokeCanceledTask() const { + return myOpt.invokeCanceledTask; + } + + void loop() { +#ifdef __linux__ + pthread_setname_np(pthread_self(), "stp_coord"); +#endif + const uint64_t MAX_SLEEP_US = 1000000; + uint64_t next_sleep_us = MAX_SLEEP_US; + + while (!stopSignal) { + if (next_sleep_us > myOpt.busyWaitingIntervalUs) { + eaLoop.wait_us(next_sleep_us - myOpt.busyWaitingIntervalUs); + } else { + // Otherwise: busy waiting. + } + if (stopSignal) break; + + eaLoop.reset(); + next_sleep_us = MAX_SLEEP_US; + + std::shared_ptr thread_to_assign = nullptr; + + if (myOpt.numInitialThreads) { + // Thread pool exists, pick an idle thread. + { std::lock_guard l(idleThreadsLock); + auto entry = idleThreads.begin(); + if (entry != idleThreads.end()) { + thread_to_assign = *entry; + idleThreads.erase(entry); + } + } + if (!thread_to_assign) { + // All threads are busy, skip. + continue; + } + } + // Otherwise (empty thread pool), + // this loop thread will do execution. + + // Check timer task first (higher priority). 
+ std::shared_ptr task_to_run = nullptr; + { std::lock_guard l(timedTasksLock); + auto entry = timedTasks.begin(); + while (entry != timedTasks.end()) { + std::shared_ptr& tt = *entry; + uint64_t remaining_us = 0; + if (tt->timeToFire(remaining_us)) { + task_to_run = tt; + if ( task_to_run->isOneTime() || + task_to_run->isDone() ) { + entry = timedTasks.erase(entry); + } else { + task_to_run->reschedule(); + } + + if (task_to_run->isDone()) task_to_run.reset(); + else break; + } + if (!task_to_run) { + entry++; + // Adjust next sleep time. + next_sleep_us = std::min(next_sleep_us, remaining_us); + next_sleep_us = std::min(next_sleep_us, MAX_SLEEP_US); + } + } + } + + if (!task_to_run) { + // If there is no timer task to be fired for now, + // pick a normal task. + std::lock_guard l(normalTasksLock); + auto entry = normalTasks.begin(); + if (entry != normalTasks.end()) { + task_to_run = *entry; + normalTasks.erase(entry); + } + + if (normalTasks.size()) { + // Still have pending task(s). Do not sleep. + next_sleep_us = 0; + } + } + + if (!task_to_run) { + // No task to run, skip. + if (thread_to_assign) { + std::lock_guard l(idleThreadsLock); + idleThreads.push_front(thread_to_assign); + } + continue; + } + + if (myOpt.numInitialThreads) { + // Assign the task to picked thread. + thread_to_assign->assign(task_to_run); + } else { + // Empty thread pool, execute here. + task_to_run->execute(TaskResult()); + } + } + } + + // Options. + ThreadPoolOptions myOpt; + + // `true` if system is being stopped. + std::atomic stopSignal; + + // Condition variable for main coordination loop. + EventAwaiter eaLoop; + + // Main coordination loop thread. + std::shared_ptr< std::thread > loopThread; + + // List of timer tasks. + std::list< std::shared_ptr > timedTasks; + std::mutex timedTasksLock; + + // List of normal tasks. + std::list< std::shared_ptr > normalTasks; + std::mutex normalTasksLock; + + // List of idle threads. 
+ std::list< std::shared_ptr > idleThreads; + std::mutex idleThreadsLock; +}; + +}; + diff --git a/src/skiplist.cc b/src/skiplist.cc new file mode 100644 index 0000000..5e96eaa --- /dev/null +++ b/src/skiplist.cc @@ -0,0 +1,999 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2017 Jung-Sang Ahn +See URL: https://github.com/greensky00/skiplist + (v0.2.9) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "skiplist.h" + +#include + +#define __SLD_RT_INS(e, n, t, c) +#define __SLD_NC_INS(n, nn, t, c) +#define __SLD_RT_RMV(e, n, t, c) +#define __SLD_NC_RMV(n, nn, t, c) +#define __SLD_BM(n) +#define __SLD_ASSERT(cond) +#define __SLD_P(args...) +#define __SLD_(b) + +//#define __SL_DEBUG (1) +#ifdef __SL_DEBUG + #ifndef __cplusplus + #error "Debug mode is available with C++ compiler only." 
+ #endif + #include "skiplist_debug.h" +#endif + +#define __SL_YIELD (1) +#ifdef __SL_YIELD + #ifdef __cplusplus + #include + #define YIELD() std::this_thread::yield() + #else + #include + #define YIELD() sched_yield() + #endif +#else + #define YIELD() +#endif + +#if defined(_STL_ATOMIC) && defined(__cplusplus) + // C++ (STL) atomic operations + #define MOR std::memory_order_relaxed + #define ATM_GET(var) (var).load(MOR) + #define ATM_LOAD(var, val) (val) = (var).load(MOR) + #define ATM_STORE(var, val) (var).store((val), MOR) + #define ATM_CAS(var, exp, val) (var).compare_exchange_weak((exp), (val)) + #define ATM_FETCH_ADD(var, val) (var).fetch_add(val, MOR) + #define ATM_FETCH_SUB(var, val) (var).fetch_sub(val, MOR) + #define ALLOC_(type, var, count) (var) = new type[count] + #define FREE_(var) delete[] (var) +#else + // C-style atomic operations + #ifndef __cplusplus + typedef uint8_t bool; + #ifndef true + #define true 1 + #endif + #ifndef false + #define false 0 + #endif + #endif + + #ifndef __cplusplus + #define thread_local /*_Thread_local*/ + #endif + + #define MOR __ATOMIC_RELAXED + #define ATM_GET(var) (var) + #define ATM_LOAD(var, val) __atomic_load(&(var), &(val), MOR) + #define ATM_STORE(var, val) __atomic_store(&(var), &(val), MOR) + #define ATM_CAS(var, exp, val) \ + __atomic_compare_exchange(&(var), &(exp), &(val), 1, MOR, MOR) + #define ATM_FETCH_ADD(var, val) __atomic_fetch_add(&(var), (val), MOR) + #define ATM_FETCH_SUB(var, val) __atomic_fetch_sub(&(var), (val), MOR) + #define ALLOC_(type, var, count) \ + (var) = (type*)calloc(count, sizeof(type)) + #define FREE_(var) free(var) +#endif + +static inline void _sl_node_init(skiplist_node *node, + size_t top_layer) +{ + if (top_layer > UINT8_MAX) top_layer = UINT8_MAX; + + __SLD_ASSERT(node->is_fully_linked == false); + __SLD_ASSERT(node->being_modified == false); + + bool bool_val = false; + ATM_STORE(node->is_fully_linked, bool_val); + ATM_STORE(node->being_modified, bool_val); + 
ATM_STORE(node->removed, bool_val); + + if (node->top_layer != top_layer || + node->next == NULL) { + + node->top_layer = top_layer; + + if (node->next) FREE_(node->next); + ALLOC_(atm_node_ptr, node->next, top_layer+1); + } +} + +void skiplist_init(skiplist_raw *slist, + skiplist_cmp_t *cmp_func) { + + slist->cmp_func = NULL; + slist->aux = NULL; + + // fanout 4 + layer 12: 4^12 ~= upto 17M items under O(lg n) complexity. + // for +17M items, complexity will grow linearly: O(k lg n). + slist->fanout = 4; + slist->max_layer = 12; + slist->num_entries = 0; + + ALLOC_(atm_uint32_t, slist->layer_entries, slist->max_layer); + slist->top_layer = 0; + + skiplist_init_node(&slist->head); + skiplist_init_node(&slist->tail); + + _sl_node_init(&slist->head, slist->max_layer); + _sl_node_init(&slist->tail, slist->max_layer); + + size_t layer; + for (layer = 0; layer < slist->max_layer; ++layer) { + slist->head.next[layer] = &slist->tail; + slist->tail.next[layer] = NULL; + } + + bool bool_val = true; + ATM_STORE(slist->head.is_fully_linked, bool_val); + ATM_STORE(slist->tail.is_fully_linked, bool_val); + slist->cmp_func = cmp_func; +} + +void skiplist_free(skiplist_raw *slist) +{ + skiplist_free_node(&slist->head); + skiplist_free_node(&slist->tail); + + FREE_(slist->layer_entries); + slist->layer_entries = NULL; + + slist->aux = NULL; + slist->cmp_func = NULL; +} + +void skiplist_init_node(skiplist_node *node) +{ + node->next = NULL; + + bool bool_false = false; + ATM_STORE(node->is_fully_linked, bool_false); + ATM_STORE(node->being_modified, bool_false); + ATM_STORE(node->removed, bool_false); + + node->accessing_next = 0; + node->top_layer = 0; + node->ref_count = 0; +} + +void skiplist_free_node(skiplist_node *node) +{ + FREE_(node->next); + node->next = NULL; +} + +size_t skiplist_get_size(skiplist_raw* slist) { + uint32_t val; + ATM_LOAD(slist->num_entries, val); + return val; +} + +skiplist_raw_config skiplist_get_default_config() +{ + skiplist_raw_config ret; + 
ret.fanout = 4; + ret.maxLayer = 12; + ret.aux = NULL; + return ret; +} + +skiplist_raw_config skiplist_get_config(skiplist_raw *slist) +{ + skiplist_raw_config ret; + ret.fanout = slist->fanout; + ret.maxLayer = slist->max_layer; + ret.aux = slist->aux; + return ret; +} + +void skiplist_set_config(skiplist_raw *slist, + skiplist_raw_config config) +{ + slist->fanout = config.fanout; + + slist->max_layer = config.maxLayer; + if (slist->layer_entries) FREE_(slist->layer_entries); + ALLOC_(atm_uint32_t, slist->layer_entries, slist->max_layer); + + slist->aux = config.aux; +} + +static inline int _sl_cmp(skiplist_raw *slist, + skiplist_node *a, + skiplist_node *b) +{ + if (a == b) return 0; + if (a == &slist->head || b == &slist->tail) return -1; + if (a == &slist->tail || b == &slist->head) return 1; + return slist->cmp_func(a, b, slist->aux); +} + +static inline bool _sl_valid_node(skiplist_node *node) { + bool is_fully_linked = false; + ATM_LOAD(node->is_fully_linked, is_fully_linked); + return is_fully_linked; +} + +static inline void _sl_read_lock_an(skiplist_node* node) { + for(;;) { + // Wait for active writer to release the lock + uint32_t accessing_next = 0; + ATM_LOAD(node->accessing_next, accessing_next); + while (accessing_next & 0xfff00000) { + YIELD(); + ATM_LOAD(node->accessing_next, accessing_next); + } + + ATM_FETCH_ADD(node->accessing_next, 0x1); + ATM_LOAD(node->accessing_next, accessing_next); + if ((accessing_next & 0xfff00000) == 0) { + return; + } + + ATM_FETCH_SUB(node->accessing_next, 0x1); + } +} + +static inline void _sl_read_unlock_an(skiplist_node* node) { + ATM_FETCH_SUB(node->accessing_next, 0x1); +} + +static inline void _sl_write_lock_an(skiplist_node* node) { + for(;;) { + // Wait for active writer to release the lock + uint32_t accessing_next = 0; + ATM_LOAD(node->accessing_next, accessing_next); + while (accessing_next & 0xfff00000) { + YIELD(); + ATM_LOAD(node->accessing_next, accessing_next); + } + + 
ATM_FETCH_ADD(node->accessing_next, 0x100000); + ATM_LOAD(node->accessing_next, accessing_next); + if((accessing_next & 0xfff00000) == 0x100000) { + // Wait until there's no more readers + while (accessing_next & 0x000fffff) { + YIELD(); + ATM_LOAD(node->accessing_next, accessing_next); + } + return; + } + + ATM_FETCH_SUB(node->accessing_next, 0x100000); + } +} + +static inline void _sl_write_unlock_an(skiplist_node* node) { + ATM_FETCH_SUB(node->accessing_next, 0x100000); +} + +// Note: it increases the `ref_count` of returned node. +// Caller is responsible to decrease it. +static inline skiplist_node* _sl_next(skiplist_raw* slist, + skiplist_node* cur_node, + int layer, + skiplist_node* node_to_find, + bool* found) +{ + skiplist_node *next_node = NULL; + + // Turn on `accessing_next`: + // now `cur_node` is not removable from skiplist, + // which means that `cur_node->next` will be consistent + // until clearing `accessing_next`. + _sl_read_lock_an(cur_node); { + if (!_sl_valid_node(cur_node)) { + _sl_read_unlock_an(cur_node); + return NULL; + } + ATM_LOAD(cur_node->next[layer], next_node); + // Increase ref count of `next_node`: + // now `next_node` is not destroyable. + + // << Remaining issue >> + // 1) initially: A -> B + // 2) T1: call _sl_next(A): + // A.accessing_next := true; + // next_node := B; + // ----- context switch happens here ----- + // 3) T2: insert C: + // A -> C -> B + // 4) T2: and then erase B, and free B. + // A -> C B(freed) + // ----- context switch back again ----- + // 5) T1: try to do something with B, + // but crash happens. + // + // ... maybe resolved using RW spinlock (Aug 21, 2017). 
+ __SLD_ASSERT(next_node); + ATM_FETCH_ADD(next_node->ref_count, 1); + __SLD_ASSERT(next_node->top_layer >= layer); + } _sl_read_unlock_an(cur_node); + + size_t num_nodes = 0; + skiplist_node* nodes[256]; + + while ( (next_node && !_sl_valid_node(next_node)) || + next_node == node_to_find ) { + if (found && node_to_find == next_node) *found = true; + + skiplist_node* temp = next_node; + _sl_read_lock_an(temp); { + __SLD_ASSERT(next_node); + if (!_sl_valid_node(temp)) { + _sl_read_unlock_an(temp); + ATM_FETCH_SUB(temp->ref_count, 1); + next_node = NULL; + break; + } + ATM_LOAD(temp->next[layer], next_node); + ATM_FETCH_ADD(next_node->ref_count, 1); + nodes[num_nodes++] = temp; + __SLD_ASSERT(next_node->top_layer >= layer); + } _sl_read_unlock_an(temp); + } + + for (size_t ii=0; iiref_count, 1); + } + + return next_node; +} + +static inline size_t _sl_decide_top_layer(skiplist_raw *slist) +{ + size_t layer = 0; + while (layer+1 < slist->max_layer) { + // coin filp + if (rand() % slist->fanout == 0) { + // grow: 1/fanout probability + layer++; + } else { + // stop: 1 - 1/fanout probability + break; + } + } + return layer; +} + +static inline void _sl_clr_flags(skiplist_node** node_arr, + int start_layer, + int top_layer) +{ + int layer; + for (layer = start_layer; layer <= top_layer; ++layer) { + if ( layer == top_layer || + node_arr[layer] != node_arr[layer+1] ) { + + bool exp = true; + bool bool_false = false; + if (!ATM_CAS(node_arr[layer]->being_modified, exp, bool_false)) { + __SLD_ASSERT(0); + } + } + } +} + +static inline bool _sl_valid_prev_next(skiplist_node *prev, + skiplist_node *next) { + return _sl_valid_node(prev) && _sl_valid_node(next); +} + +static inline int _skiplist_insert(skiplist_raw *slist, + skiplist_node *node, + bool no_dup) +{ + __SLD_( + thread_local std::thread::id tid = std::this_thread::get_id(); + thread_local size_t tid_hash = std::hash{}(tid) % 256; + (void)tid_hash; + ) + + int top_layer = _sl_decide_top_layer(slist); + bool 
bool_true = true; + + // init node before insertion + _sl_node_init(node, top_layer); + _sl_write_lock_an(node); + + skiplist_node* prevs[SKIPLIST_MAX_LAYER]; + skiplist_node* nexts[SKIPLIST_MAX_LAYER]; + + __SLD_P("%02x ins %p begin\n", (int)tid_hash, node); + +insert_retry: + // in pure C, a label can only be part of a stmt. + (void)top_layer; + + int cmp = 0, cur_layer = 0, layer; + skiplist_node *cur_node = &slist->head; + ATM_FETCH_ADD(cur_node->ref_count, 1); + + __SLD_(size_t nh = 0); + __SLD_(thread_local skiplist_node* history[1024]; (void)history); + + int sl_top_layer = slist->top_layer; + if (top_layer > sl_top_layer) sl_top_layer = top_layer; + for (cur_layer = sl_top_layer; cur_layer >= 0; --cur_layer) { + do { + __SLD_( history[nh++] = cur_node ); + + skiplist_node *next_node = _sl_next(slist, cur_node, cur_layer, + NULL, NULL); + if (!next_node) { + _sl_clr_flags(prevs, cur_layer+1, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + YIELD(); + goto insert_retry; + } + cmp = _sl_cmp(slist, node, next_node); + if (cmp > 0) { + // cur_node < next_node < node + // => move to next node + skiplist_node* temp = cur_node; + cur_node = next_node; + ATM_FETCH_SUB(temp->ref_count, 1); + continue; + } else { + // otherwise: cur_node < node <= next_node + ATM_FETCH_SUB(next_node->ref_count, 1); + } + + if (no_dup && cmp == 0) { + // Duplicate key is not allowed. + _sl_clr_flags(prevs, cur_layer+1, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + return -1; + } + + if (cur_layer <= top_layer) { + prevs[cur_layer] = cur_node; + nexts[cur_layer] = next_node; + // both 'prev' and 'next' should be fully linked before + // insertion, and no other thread should not modify 'prev' + // at the same time. 
+ + int error_code = 0; + int locked_layer = cur_layer + 1; + + // check if prev node is duplicated with upper layer + if (cur_layer < top_layer && + prevs[cur_layer] == prevs[cur_layer+1]) { + // duplicate + // => which means that 'being_modified' flag is already true + // => do nothing + } else { + bool expected = false; + if (ATM_CAS(prevs[cur_layer]->being_modified, + expected, bool_true)) { + locked_layer = cur_layer; + } else { + error_code = -1; + } + } + + if (error_code == 0 && + !_sl_valid_prev_next(prevs[cur_layer], nexts[cur_layer])) { + error_code = -2; + } + + if (error_code != 0) { + __SLD_RT_INS(error_code, node, top_layer, cur_layer); + _sl_clr_flags(prevs, locked_layer, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + YIELD(); + goto insert_retry; + } + + // set current node's pointers + ATM_STORE(node->next[cur_layer], nexts[cur_layer]); + + // check if `cur_node->next` has been changed from `next_node`. + skiplist_node* next_node_again = + _sl_next(slist, cur_node, cur_layer, NULL, NULL); + ATM_FETCH_SUB(next_node_again->ref_count, 1); + if (next_node_again != next_node) { + __SLD_NC_INS(cur_node, next_node, top_layer, cur_layer); + // clear including the current layer + // as we already set modification flag above. + _sl_clr_flags(prevs, cur_layer, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + YIELD(); + goto insert_retry; + } + } + + if (cur_layer) { + // non-bottom layer => go down + break; + } + + // bottom layer => insertion succeeded + // change prev/next nodes' prev/next pointers from 0 ~ top_layer + for (layer = 0; layer <= top_layer; ++layer) { + // `accessing_next` works as a spin-lock. 
+ _sl_write_lock_an(prevs[layer]); + skiplist_node* exp = nexts[layer]; + if ( !ATM_CAS(prevs[layer]->next[layer], exp, node) ) { + __SLD_P("%02x ASSERT ins %p[%d] -> %p (expected %p)\n", + (int)tid_hash, prevs[layer], cur_layer, + ATM_GET(prevs[layer]->next[layer]), nexts[layer] ); + __SLD_ASSERT(0); + } + __SLD_P("%02x ins %p[%d] -> %p -> %p\n", + (int)tid_hash, prevs[layer], layer, + node, ATM_GET(node->next[layer]) ); + _sl_write_unlock_an(prevs[layer]); + } + + // now this node is fully linked + ATM_STORE(node->is_fully_linked, bool_true); + + // allow removing next nodes + _sl_write_unlock_an(node); + + __SLD_P("%02x ins %p done\n", (int)tid_hash, node); + + ATM_FETCH_ADD(slist->num_entries, 1); + ATM_FETCH_ADD(slist->layer_entries[node->top_layer], 1); + for (int ii=slist->max_layer-1; ii>=0; --ii) { + if (slist->layer_entries[ii] > 0) { + slist->top_layer = ii; + break; + } + } + + // modification is done for all layers + _sl_clr_flags(prevs, 0, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + + return 0; + } while (cur_node != &slist->tail); + } + return 0; +} + +int skiplist_insert(skiplist_raw *slist, + skiplist_node *node) +{ + return _skiplist_insert(slist, node, false); +} + +int skiplist_insert_nodup(skiplist_raw *slist, + skiplist_node *node) +{ + return _skiplist_insert(slist, node, true); +} + +typedef enum { + SM = -2, + SMEQ = -1, + EQ = 0, + GTEQ = 1, + GT = 2 +} _sl_find_mode; + +// Note: it increases the `ref_count` of returned node. +// Caller is responsible to decrease it. 
+static inline skiplist_node* _sl_find(skiplist_raw *slist, + skiplist_node *query, + _sl_find_mode mode) +{ + // mode: + // SM -2: smaller + // SMEQ -1: smaller or equal + // EQ 0: equal + // GTEQ 1: greater or equal + // GT 2: greater +find_retry: + (void)mode; + int cmp = 0; + int cur_layer = 0; + skiplist_node *cur_node = &slist->head; + ATM_FETCH_ADD(cur_node->ref_count, 1); + + __SLD_(size_t nh = 0); + __SLD_(thread_local skiplist_node* history[1024]; (void)history); + + uint8_t sl_top_layer = slist->top_layer; + for (cur_layer = sl_top_layer; cur_layer >= 0; --cur_layer) { + do { + __SLD_(history[nh++] = cur_node); + + skiplist_node *next_node = _sl_next(slist, cur_node, cur_layer, + NULL, NULL); + if (!next_node) { + ATM_FETCH_SUB(cur_node->ref_count, 1); + YIELD(); + goto find_retry; + } + cmp = _sl_cmp(slist, query, next_node); + if (cmp > 0) { + // cur_node < next_node < query + // => move to next node + skiplist_node* temp = cur_node; + cur_node = next_node; + ATM_FETCH_SUB(temp->ref_count, 1); + continue; + } else if (-1 <= mode && mode <= 1 && cmp == 0) { + // cur_node < query == next_node .. 
return + ATM_FETCH_SUB(cur_node->ref_count, 1); + return next_node; + } + + // otherwise: cur_node < query < next_node + if (cur_layer) { + // non-bottom layer => go down + ATM_FETCH_SUB(next_node->ref_count, 1); + break; + } + + // bottom layer + if (mode < 0 && cur_node != &slist->head) { + // smaller mode + ATM_FETCH_SUB(next_node->ref_count, 1); + return cur_node; + } else if (mode > 0 && next_node != &slist->tail) { + // greater mode + ATM_FETCH_SUB(cur_node->ref_count, 1); + return next_node; + } + // otherwise: exact match mode OR not found + ATM_FETCH_SUB(cur_node->ref_count, 1); + ATM_FETCH_SUB(next_node->ref_count, 1); + return NULL; + } while (cur_node != &slist->tail); + } + + return NULL; +} + +skiplist_node* skiplist_find(skiplist_raw *slist, + skiplist_node *query) +{ + return _sl_find(slist, query, EQ); +} + +skiplist_node* skiplist_find_smaller_or_equal(skiplist_raw *slist, + skiplist_node *query) +{ + return _sl_find(slist, query, SMEQ); +} + +skiplist_node* skiplist_find_greater_or_equal(skiplist_raw *slist, + skiplist_node *query) +{ + return _sl_find(slist, query, GTEQ); +} + +int skiplist_erase_node_passive(skiplist_raw *slist, + skiplist_node *node) +{ + __SLD_( + thread_local std::thread::id tid = std::this_thread::get_id(); + thread_local size_t tid_hash = std::hash{}(tid) % 256; + (void)tid_hash; + ) + + int top_layer = node->top_layer; + bool bool_true = true, bool_false = false; + bool removed = false; + bool is_fully_linked = false; + + ATM_LOAD(node->removed, removed); + if (removed) { + // already removed + return -1; + } + + skiplist_node* prevs[SKIPLIST_MAX_LAYER]; + skiplist_node* nexts[SKIPLIST_MAX_LAYER]; + + bool expected = false; + if (!ATM_CAS(node->being_modified, expected, bool_true)) { + // already being modified .. cannot work on this node for now. + __SLD_BM(node); + return -2; + } + + // set removed flag first, so that reader cannot read this node. 
+ ATM_STORE(node->removed, bool_true); + + __SLD_P("%02x rmv %p begin\n", (int)tid_hash, node); + +erase_node_retry: + ATM_LOAD(node->is_fully_linked, is_fully_linked); + if (!is_fully_linked) { + // already unlinked .. remove is done by other thread + ATM_STORE(node->removed, bool_false); + ATM_STORE(node->being_modified, bool_false); + return -3; + } + + int cmp = 0; + bool found_node_to_erase = false; + (void)found_node_to_erase; + skiplist_node *cur_node = &slist->head; + ATM_FETCH_ADD(cur_node->ref_count, 1); + + __SLD_(size_t nh = 0); + __SLD_(thread_local skiplist_node* history[1024]; (void)history); + + int cur_layer = slist->top_layer; + for (; cur_layer >= 0; --cur_layer) { + do { + __SLD_( history[nh++] = cur_node ); + + bool node_found = false; + skiplist_node *next_node = _sl_next(slist, cur_node, cur_layer, + node, &node_found); + if (!next_node) { + _sl_clr_flags(prevs, cur_layer+1, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + YIELD(); + goto erase_node_retry; + } + + // Note: unlike insert(), we should find exact position of `node`. + cmp = _sl_cmp(slist, node, next_node); + if (cmp > 0 || (cur_layer <= top_layer && !node_found) ) { + // cur_node <= next_node < node + // => move to next node + skiplist_node* temp = cur_node; + cur_node = next_node; + __SLD_( if (cmp > 0) { + int cmp2 = _sl_cmp(slist, cur_node, node); + if (cmp2 > 0) { + // node < cur_node <= next_node: not found. + _sl_clr_flags(prevs, cur_layer+1, top_layer); + ATM_FETCH_SUB(temp->ref_count, 1); + ATM_FETCH_SUB(next_node->ref_count, 1); + __SLD_ASSERT(0); + } + } ) + ATM_FETCH_SUB(temp->ref_count, 1); + continue; + } else { + // otherwise: cur_node <= node <= next_node + ATM_FETCH_SUB(next_node->ref_count, 1); + } + + if (cur_layer <= top_layer) { + prevs[cur_layer] = cur_node; + // note: 'next_node' and 'node' should not be the same, + // as 'removed' flag is already set. 
+ __SLD_ASSERT(next_node != node); + nexts[cur_layer] = next_node; + + // check if prev node duplicates with upper layer + int error_code = 0; + int locked_layer = cur_layer + 1; + if (cur_layer < top_layer && + prevs[cur_layer] == prevs[cur_layer+1]) { + // duplicate with upper layer + // => which means that 'being_modified' flag is already true + // => do nothing. + } else { + expected = false; + if (ATM_CAS(prevs[cur_layer]->being_modified, + expected, bool_true)) { + locked_layer = cur_layer; + } else { + error_code = -1; + } + } + + if (error_code == 0 && + !_sl_valid_prev_next(prevs[cur_layer], nexts[cur_layer])) { + error_code = -2; + } + + if (error_code != 0) { + __SLD_RT_RMV(error_code, node, top_layer, cur_layer); + _sl_clr_flags(prevs, locked_layer, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + YIELD(); + goto erase_node_retry; + } + + skiplist_node* next_node_again = + _sl_next(slist, cur_node, cur_layer, node, NULL); + ATM_FETCH_SUB(next_node_again->ref_count, 1); + if (next_node_again != nexts[cur_layer]) { + // `next` pointer has been changed, retry. + __SLD_NC_RMV(cur_node, nexts[cur_layer], top_layer, cur_layer); + _sl_clr_flags(prevs, cur_layer, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + YIELD(); + goto erase_node_retry; + } + } + if (cur_layer == 0) found_node_to_erase = true; + // go down + break; + } while (cur_node != &slist->tail); + } + // Not exist in the skiplist, should not happen. + __SLD_ASSERT(found_node_to_erase); + // bottom layer => removal succeeded. 
+ // mark this node unlinked + _sl_write_lock_an(node); { + ATM_STORE(node->is_fully_linked, bool_false); + } _sl_write_unlock_an(node); + + // change prev nodes' next pointer from 0 ~ top_layer + for (cur_layer = 0; cur_layer <= top_layer; ++cur_layer) { + _sl_write_lock_an(prevs[cur_layer]); + skiplist_node* exp = node; + __SLD_ASSERT(exp != nexts[cur_layer]); + __SLD_ASSERT(nexts[cur_layer]->is_fully_linked); + if ( !ATM_CAS(prevs[cur_layer]->next[cur_layer], + exp, nexts[cur_layer]) ) { + __SLD_P("%02x ASSERT rmv %p[%d] -> %p (node %p)\n", + (int)tid_hash, prevs[cur_layer], cur_layer, + ATM_GET(prevs[cur_layer]->next[cur_layer]), node ); + __SLD_ASSERT(0); + } + __SLD_ASSERT(nexts[cur_layer]->top_layer >= cur_layer); + __SLD_P("%02x rmv %p[%d] -> %p (node %p)\n", + (int)tid_hash, prevs[cur_layer], cur_layer, + nexts[cur_layer], node); + _sl_write_unlock_an(prevs[cur_layer]); + } + + __SLD_P("%02x rmv %p done\n", (int)tid_hash, node); + + ATM_FETCH_SUB(slist->num_entries, 1); + ATM_FETCH_SUB(slist->layer_entries[node->top_layer], 1); + for (int ii=slist->max_layer-1; ii>=0; --ii) { + if (slist->layer_entries[ii] > 0) { + slist->top_layer = ii; + break; + } + } + + // modification is done for all layers + _sl_clr_flags(prevs, 0, top_layer); + ATM_FETCH_SUB(cur_node->ref_count, 1); + + ATM_STORE(node->being_modified, bool_false); + + return 0; +} + +int skiplist_erase_node(skiplist_raw *slist, + skiplist_node *node) +{ + int ret = 0; + do { + ret = skiplist_erase_node_passive(slist, node); + // if ret == -2, other thread is accessing the same node + // at the same time. try again. + } while (ret == -2); + return ret; +} + +int skiplist_erase(skiplist_raw *slist, + skiplist_node *query) +{ + skiplist_node *found = skiplist_find(slist, query); + if (!found) { + // key not found + return -4; + } + + int ret = 0; + do { + ret = skiplist_erase_node_passive(slist, found); + // if ret == -2, other thread is accessing the same node + // at the same time. try again. 
+ } while (ret == -2); + + ATM_FETCH_SUB(found->ref_count, 1); + return ret; +} + +int skiplist_is_valid_node(skiplist_node* node) { + return _sl_valid_node(node); +} + +int skiplist_is_safe_to_free(skiplist_node* node) { + if (node->accessing_next) return 0; + if (node->being_modified) return 0; + if (!node->removed) return 0; + + uint16_t ref_count = 0; + ATM_LOAD(node->ref_count, ref_count); + if (ref_count) return 0; + return 1; +} + +void skiplist_wait_for_free(skiplist_node* node) { + while (!skiplist_is_safe_to_free(node)) { + YIELD(); + } +} + +void skiplist_grab_node(skiplist_node* node) { + ATM_FETCH_ADD(node->ref_count, 1); +} + +void skiplist_release_node(skiplist_node* node) { + __SLD_ASSERT(node->ref_count); + ATM_FETCH_SUB(node->ref_count, 1); +} + +skiplist_node* skiplist_next(skiplist_raw *slist, + skiplist_node *node) { + // << Issue >> + // If `node` is already removed and its next node is also removed + // and then released, the link update will not be applied to `node` + // as it is already unrechable from skiplist. `node` still points to + // the released node so that `_sl_next(node)` may return corrupted + // memory region. + // + // 0) initial: + // A -> B -> C -> D + // + // 1) B is `node`, which is removed but not yet released: + // B --+-> C -> D + // | + // A --+ + // + // 2) remove C, and then release: + // B -> !C! +-> D + // | + // A --------+ + // + // 3) skiplist_next(B): + // will fetch C, which is already released so that + // may contain garbage data. + // + // In this case, start over from the top layer, + // to find valid link (same as in prev()). 
+ + skiplist_node *next = _sl_next(slist, node, 0, NULL, NULL); + if (!next) next = _sl_find(slist, node, GT); + + if (next == &slist->tail) return NULL; + return next; +} + +skiplist_node* skiplist_prev(skiplist_raw *slist, + skiplist_node *node) { + skiplist_node *prev = _sl_find(slist, node, SM); + if (prev == &slist->head) return NULL; + return prev; +} + +skiplist_node* skiplist_begin(skiplist_raw *slist) { + skiplist_node *next = NULL; + while (!next) { + next = _sl_next(slist, &slist->head, 0, NULL, NULL); + } + if (next == &slist->tail) return NULL; + return next; +} + +skiplist_node* skiplist_end(skiplist_raw *slist) { + return skiplist_prev(slist, &slist->tail); +} + diff --git a/src/skiplist.h b/src/skiplist.h new file mode 100644 index 0000000..15f5fb7 --- /dev/null +++ b/src/skiplist.h @@ -0,0 +1,144 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2017 Jung-Sang Ahn +See URL: https://github.com/greensky00/skiplist + (v0.2.9) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#ifndef _JSAHN_SKIPLIST_H +#define _JSAHN_SKIPLIST_H (1) + +#include +#include + +#define SKIPLIST_MAX_LAYER (64) + +struct _skiplist_node; + +//#define _STL_ATOMIC (1) +#ifdef __APPLE__ + #define _STL_ATOMIC (1) +#endif +#if defined(_STL_ATOMIC) && defined(__cplusplus) + #include + typedef std::atomic<_skiplist_node*> atm_node_ptr; + typedef std::atomic atm_bool; + typedef std::atomic atm_uint8_t; + typedef std::atomic atm_uint16_t; + typedef std::atomic atm_uint32_t; +#else + typedef struct _skiplist_node* atm_node_ptr; + typedef uint8_t atm_bool; + typedef uint8_t atm_uint8_t; + typedef uint16_t atm_uint16_t; + typedef uint32_t atm_uint32_t; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _skiplist_node { + atm_node_ptr *next; + atm_bool is_fully_linked; + atm_bool being_modified; + atm_bool removed; + uint8_t top_layer; // 0: bottom + atm_uint16_t ref_count; + atm_uint32_t accessing_next; +} skiplist_node; + +// *a < *b : return neg +// *a == *b : return 0 +// *a > *b : return pos +typedef int skiplist_cmp_t(skiplist_node *a, skiplist_node *b, void *aux); + +typedef struct { + size_t fanout; + size_t maxLayer; + void *aux; +} skiplist_raw_config; + +typedef struct { + skiplist_node head; + skiplist_node tail; + skiplist_cmp_t *cmp_func; + void *aux; + atm_uint32_t num_entries; + atm_uint32_t* layer_entries; + atm_uint8_t top_layer; + uint8_t fanout; + uint8_t max_layer; +} skiplist_raw; + +#ifndef _get_entry +#define _get_entry(ELEM, STRUCT, MEMBER) \ + ((STRUCT *) ((uint8_t *) (ELEM) - offsetof (STRUCT, MEMBER))) +#endif + +void skiplist_init(skiplist_raw* slist, + skiplist_cmp_t* cmp_func); +void skiplist_free(skiplist_raw* slist); + +void skiplist_init_node(skiplist_node* node); +void skiplist_free_node(skiplist_node* node); + +size_t skiplist_get_size(skiplist_raw* slist); + +skiplist_raw_config skiplist_get_default_config(); +skiplist_raw_config 
skiplist_get_config(skiplist_raw* slist); + +void skiplist_set_config(skiplist_raw* slist, + skiplist_raw_config config); + +int skiplist_insert(skiplist_raw* slist, + skiplist_node* node); +int skiplist_insert_nodup(skiplist_raw *slist, + skiplist_node *node); + +skiplist_node* skiplist_find(skiplist_raw* slist, + skiplist_node* query); +skiplist_node* skiplist_find_smaller_or_equal(skiplist_raw* slist, + skiplist_node* query); +skiplist_node* skiplist_find_greater_or_equal(skiplist_raw* slist, + skiplist_node* query); + +int skiplist_erase_node_passive(skiplist_raw* slist, + skiplist_node* node); +int skiplist_erase_node(skiplist_raw *slist, + skiplist_node *node); +int skiplist_erase(skiplist_raw* slist, + skiplist_node* query); + +int skiplist_is_valid_node(skiplist_node* node); +int skiplist_is_safe_to_free(skiplist_node* node); +void skiplist_wait_for_free(skiplist_node* node); + +void skiplist_grab_node(skiplist_node* node); +void skiplist_release_node(skiplist_node* node); + +skiplist_node* skiplist_next(skiplist_raw* slist, + skiplist_node* node); +skiplist_node* skiplist_prev(skiplist_raw* slist, + skiplist_node* node); +skiplist_node* skiplist_begin(skiplist_raw* slist); +skiplist_node* skiplist_end(skiplist_raw* slist); + +#ifdef __cplusplus +} +#endif + +#endif // _JSAHN_SKIPLIST_H diff --git a/src/table_append.cc b/src/table_append.cc new file mode 100644 index 0000000..9ffbcec --- /dev/null +++ b/src/table_append.cc @@ -0,0 +1,48 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "table_mgr.h" + +#include "db_internal.h" + +#include _MACRO_TO_STR(LOGGER_H) + +namespace jungle { + +void TableMgr::putLsmFlushResult( TableFile* cur_file, + const std::list& local_records, + std::list& res_out ) +{ + LsmFlushResult res; + if (local_records.size()) { + Record* min_rec = *(local_records.begin()); + res = LsmFlushResult(cur_file, min_rec->kv.key); + } else { + res = LsmFlushResult(cur_file); + } + res_out.push_back(res); +} + +void TableMgr::putLsmFlushResultWithKey( TableFile* cur_file, + const SizedBuf& key, + std::list& res_out ) +{ + LsmFlushResult res(cur_file, key); + res_out.push_back(res); +} + +} + diff --git a/src/table_compact_condition.cc b/src/table_compact_condition.cc new file mode 100644 index 0000000..e7e5d50 --- /dev/null +++ b/src/table_compact_condition.cc @@ -0,0 +1,605 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "table_mgr.h" + +#include "db_internal.h" + +namespace jungle { + +struct VictimCandidate { + VictimCandidate(TableInfo* _table = nullptr, + uint64_t _wss = 0, + uint64_t _total = 0) + : table(_table), wss(_wss), total(_total) {} + TableInfo* table; + uint64_t wss; + uint64_t total; +}; + +Status TableMgr::pickVictimTable(size_t level, + VictimPolicy policy, + bool honor_limit, + TableInfo*& victim_table_out, + uint64_t& wss_out, + uint64_t& total_out) +{ + TableInfo* victim_table = nullptr; + Status s; + + DBMgr* db_mgr = DBMgr::getWithoutInit(); + GlobalConfig* g_config = db_mgr->getGlobalConfig(); + DebugParams d_params = db_mgr->getDebugParams(); + bool d_params_effective = db_mgr->isDebugParamsEffective(); + + const DBConfig* db_config = getDbConfig(); + uint64_t MAX_TABLE_SIZE = db_config->getMaxTableSize(level); + + std::list tables; + SizedBuf empty_key; + mani->getTablesRange(level, empty_key, empty_key, tables); + + // No table to compact, return. + if (!tables.size()) return Status::TABLE_NOT_FOUND; + + std::vector candidates; + + uint64_t total_local = 0; + uint64_t wss_local = 0; + size_t max_ratio = 0; + uint64_t min_wss = std::numeric_limits::max(); + uint64_t wss_avg = 0; + size_t cur_idx = 0; + for (auto& entry: tables) { + cur_idx++; + TableInfo* t_info = entry; + + // Stack of other base table, skip. + if (!t_info->baseTable) continue; + + // This table shouldn't be involved in other compaction. + if ( isTableLocked(t_info->number) ) continue; + + TableStats t_stats; + s = t_info->file->getStats(t_stats); + if (!s) continue; + + uint64_t w_size = t_stats.workingSetSizeByte; + uint64_t t_size = t_stats.totalSizeByte; + size_t stack_size = 1; + + // If stack exists, sum up all. 
+ TableStack* stack = t_info->stack; + if (stack) { + std::lock_guard l(stack->lock); + for (TableInfo* tt: stack->tables) { + TableStats stack_stats; + s = tt->file->getStats(stack_stats); + if (!s) continue; + + w_size += t_stats.workingSetSizeByte; + t_size += t_stats.totalSizeByte; + stack_size++; + } + } + + _log_trace( myLog, "table %zu wss %zu total %zu policy %d", + t_info->number, w_size, t_size, policy ); + + if ( policy == WORKING_SET_SIZE || + policy == WORKING_SET_SIZE_SPLIT ) { + + double factor = (policy == WORKING_SET_SIZE_SPLIT) + ? 1.5 : 1.0; + // Find any table whose WSS exceeds the limit. + if ( honor_limit && + w_size <= MAX_TABLE_SIZE * factor ) { + // If we honor limit, skip if + // 1) Table size is smaller than the limit. + continue; + } + // Push to list for random picking. + candidates.push_back( VictimCandidate(t_info, w_size, t_size) ); + + } else if (policy == STACK_SIZE) { + // TODO: Deprecated, should remove relative code. + candidates.push_back( VictimCandidate(t_info, w_size, t_size) ); + + } else if (policy == STALE_RATIO) { + // Find biggest stale ratio table. + if (!w_size) continue; + if (t_size < db_config->minFileSizeToCompact) continue; + + size_t ratio = t_size * 100 / w_size; + if (honor_limit) { + if ( d_params_effective && + d_params.urgentCompactionRatio ) { + // Debugging ratio is set. Ignore the original ratio. + if ( ratio < d_params.urgentCompactionRatio ) continue; + } else if ( db_mgr->isIdleTraffic() ) { + // Idle traffic compaction. + if ( ratio < g_config->itcOpt.factor ) continue; + } else { + // Otherwise. + if ( ratio < db_config->compactionFactor ) continue; + } + } + + if (ratio > max_ratio) { + max_ratio = ratio; + wss_local = w_size; + total_local = t_size; + victim_table = t_info; + } + + } else if (policy == SMALL_WORKING_SET) { + // Find a table with the smallest working set size. + + // The first (smallest key) and the last (greatest key) + // table cannot be merged. 
+ if ( !t_info->minKey.empty() && + cur_idx < tables.size() && + w_size < min_wss ) { + min_wss = w_size; + wss_local = w_size; + total_local = t_size; + victim_table = t_info; + } + + } else { + assert(0); + } + wss_avg += w_size; + } + if (tables.size()) { + wss_avg /= tables.size(); + } + + if (policy == SMALL_WORKING_SET) { + bool do_merge = false; + if (min_wss < std::numeric_limits::max()) { + if (honor_limit) { + if(min_wss < wss_avg * 0.2) { + // If we honor the limit, merge the table if the smallest + // table's WSS is smaller than 20% of average. + do_merge = true; + } + } else { + // If we don't honor the limit, just merge the smallest table. + do_merge = true; + } + } + if (!do_merge) { + victim_table = nullptr; + } + } + + // Leveling: randomly pick among candidates. + // Others: choose table whose number is minimum (i.e., oldest). + if (!candidates.empty()) { + uint64_t min_num = std::numeric_limits::max(); + for (VictimCandidate& vc: candidates) { + if (vc.table->number < min_num) { + min_num = vc.table->number; + victim_table = vc.table; + wss_local = vc.wss; + total_local = vc.total; + } + } + } + + // Remove other tables except for victim family. + auto entry = tables.begin(); + while (entry != tables.end()) { + TableInfo* tt = *entry; + if ( tt == victim_table || + ( victim_table && + tt->minKey == victim_table->minKey ) ) { + // Keep it. + } else { + tt->done(); + } + entry++; + } + + if (victim_table) { + TableStats v_stats; + victim_table->file->getStats(v_stats); + + _log_( SimpleLogger::INFO, myLog, + "table lv %zu num %zu min key %s policy %d: " + "file size %zu (%s), active data %zu (%s), table max %zu (%s), " + "(ratio %zu vs. 
factor %zu), block reuse cycle %zu, " + "num records %zu, level active data average %zu (%s) out of %zu", + victim_table->level, victim_table->number, + victim_table->minKey.toReadableString().c_str(), + (int)policy, + total_local, Formatter::sizeToString(total_local, 2).c_str(), + wss_local, Formatter::sizeToString(wss_local, 2).c_str(), + MAX_TABLE_SIZE, Formatter::sizeToString(MAX_TABLE_SIZE, 2).c_str(), + ( wss_local + ? total_local * 100 / wss_local + : 0 ), + db_config->compactionFactor, + v_stats.blockReuseCycle, + v_stats.approxDocCount, + wss_avg, Formatter::sizeToString(wss_avg, 2).c_str(), + tables.size() ); + } + + victim_table_out = victim_table; + wss_out = wss_local; + total_out = total_local; + + return Status::OK; +} + +TableInfo* TableMgr::findLocalVictim(size_t level, + TableInfo* given_victim, + VictimPolicy policy, + bool honor_limit) +{ + if (given_victim) { + _log_info(myLog, "victim table is given: %zu_%zu at %zu", + opt.prefixNum, given_victim->number, level); + return given_victim; + } + + // If victim table is not given, + // pick a table whose active data size is the biggest in the level. 
+ TableInfo* local_victim = nullptr; + uint64_t wss = 0; + uint64_t total = 0; + Status s = pickVictimTable( level, policy, honor_limit, + local_victim, wss, total ); + if (s && local_victim) { + _log_info(myLog, "victim table is not given, " + "table manager locally picked up %zu_%zu level %zu, " + "WSS %zu total %zu, policy %d, honor limit %d", + opt.prefixNum, local_victim->number, level, + wss, total, policy, honor_limit); + } else { + _log_info(myLog, "victim table is not given, and cannot find one, " + "level %zu, policy %d, honor limit %d", + level, policy, honor_limit); + } + return local_victim; +} + +bool TableMgr::isL0CompactionInProg() { + size_t num_p = getNumL0Partitions(); + for (size_t ii=0; iiload()) return true; + } + return false; +} + +bool TableMgr::chkL0CompactCond(uint32_t hash_num) { + const DBConfig* db_config = getDbConfig(); + + // Read-only mode. + if ( db_config->readOnly ) return false; + + // Compaction is disabled. + if ( !allowCompaction ) return false; + + // Log section mode. + if ( db_config->logSectionOnly ) return false; + + // Validity of `hash_num`. + if ( hash_num >= numL0Partitions ) return false; + + // Check if compaction is already in progress. + if ( compactStatus[hash_num]->load() == true ) return false; + + // If compaction factor is zero, do not compact DB. + if ( !db_config->compactionFactor ) return false; + + // If level extension mode, L1 compaction shouldn't be happening. + if ( db_config->nextLevelExtension && + numL1Compactions > 0 ) return false; + + // If sequential loading, delay L0 -> L1 compaction. + // TODO: We cannot delay forever, should check disk free space. + if ( db_config->nextLevelExtension && + parentDb->p->flags.seqLoading ) return false; + + Status s; + std::list tables; + s = mani->getL0Tables(hash_num, tables); + if (!s) return s; + + try { + // If src table exists, that means that the engine was closed + // while compaction is running. We need to compact this table first. 
+ TableInfo* target_table = nullptr; + TableInfo* sup_table = nullptr; + bool force_compaction = false; + getTwoSmallSrcTables(tables, hash_num, target_table, sup_table); + if (!target_table) { + getTwoSmallNormalTables(tables, hash_num, target_table, sup_table); + if (!target_table) throw Status(Status::TABLE_NOT_FOUND); + } else { + _log_warn(myLog, "previously engine terminated while table %zu is " + "being compacted", target_table->number); + force_compaction = true; + } + + TableStats t_stats, s_stats; + TC( target_table->file->getStats(t_stats) ); + if (sup_table) { + TC( sup_table->file->getStats(s_stats) ); + } + + size_t ratio = 0; + if (t_stats.workingSetSizeByte) { + ratio = t_stats.totalSizeByte * 100 / t_stats.workingSetSizeByte; + } + + size_t s_ratio = (uint64_t)0xffffffff; + if (sup_table && s_stats.workingSetSizeByte) { + // If one more normal file exists, + // we should check the ratio between this file and that file as well. + s_ratio = (t_stats.totalSizeByte + s_stats.totalSizeByte) * 100 / + (s_stats.workingSetSizeByte); + } + + bool decision = false; + DBMgr* db_mgr = DBMgr::getWithoutInit(); + GlobalConfig* g_config = db_mgr->getGlobalConfig(); + DebugParams d_params = db_mgr->getDebugParams(); + bool d_params_effective = db_mgr->isDebugParamsEffective(); + + // Urgent compaction: + // => If block reuse cycle goes beyond 2x of threshold, + if ( db_config->minBlockReuseCycleToCompact && + t_stats.blockReuseCycle >= db_config->minBlockReuseCycleToCompact * 2 ) + { + decision = true; + } + + // Urgent compaction (not in next-level mode): + // => If stale block ratio goes beyond 2x of threshold, + // when file size is bigger than 64MB. + if ( !db_config->nextLevelExtension && + t_stats.totalSizeByte > (uint64_t)64*1024*1024 && + ratio > db_config->compactionFactor * 2 && + s_ratio > db_config->compactionFactor * 2 ) + { + decision = true; + } + + // Urgent compaction: + // => If file size is bigger than debugging parameter. 
+ if ( d_params_effective && + d_params.urgentCompactionFilesize && + t_stats.totalSizeByte > d_params.urgentCompactionFilesize ) + { + _log_info(myLog, "[URGENT COMPACTION] by size: %zu > %zu", + t_stats.totalSizeByte, d_params.urgentCompactionFilesize); + decision = true; + } + + // Urgent compaction: + // => If stale ratio is bigger than debugging parameter. + int urgent_or_itc = 0; + if ( d_params_effective && + d_params.urgentCompactionRatio && + t_stats.totalSizeByte > db_config->minFileSizeToCompact && + ratio > d_params.urgentCompactionRatio ) { + urgent_or_itc = 1; + } + + // Idle traffic compaction: + // => If stale ratio is bigger than idle traffic factor. + if ( db_mgr->isIdleTraffic() && + g_config->itcOpt.factor && + t_stats.totalSizeByte > db_config->minFileSizeToCompact && + ratio > g_config->itcOpt.factor ) { + urgent_or_itc = 2; + } + + if (urgent_or_itc) { + std::string type_str = (urgent_or_itc == 1) + ? "URGENT COMPACTION" + : "IDLE COMPACTION"; + if (sup_table) { + // Sup table exists. + // To avoid small files being compacted (the benefit is very + // marginal), check the actual file size. It should be bigger + // than some proportion of the sup-table size. + if ( t_stats.totalSizeByte * 100 > + s_stats.totalSizeByte * + (d_params.urgentCompactionRatio - 100) ) { + _log_info(myLog, "[%s] by ratio: %zu > %zu, " + "size: %zu %zu", + type_str.c_str(), + ratio, d_params.urgentCompactionRatio, + t_stats.totalSizeByte, + s_stats.totalSizeByte); + decision = true; + } + + } else { + // Just single file. + _log_info(myLog, "[%s] by ratio: %zu > %zu", + type_str.c_str(), + ratio, d_params.urgentCompactionRatio); + decision = true; + } + } + + // Urgent compaction (in level extension mode): + // => If file size is bigger than given L0 limit. 
+ if ( db_config->nextLevelExtension && + ( t_stats.workingSetSizeByte + + s_stats.workingSetSizeByte ) > db_config->maxL0TableSize ) + { + decision = true; + } + + // Normal condition: + if ( t_stats.totalSizeByte > db_config->minFileSizeToCompact && + t_stats.blockReuseCycle >= db_config->minBlockReuseCycleToCompact && + ratio > db_config->compactionFactor && + s_ratio > db_config->compactionFactor ) + { + if (!db_config->nextLevelExtension) { + decision = true; + } else { + // Next-level extension mode: + // File size should exceed either + // 1) the given L0 threshold, or + // 2) current L1 size / num L0 tables (if L1 exists). + uint64_t wss = 0, total = 0, max_stack = 0; + s = getLevelSize(1, wss, total, max_stack); + if (!s) decision = true; // Level-1 doesn't exist. + + if ( t_stats.totalSizeByte > db_config->maxL0TableSize || + t_stats.totalSizeByte * db_config->numL0Partitions > total ) { + decision = true; + } + } + } + + // If `force_compaction` is set, always set `decision`. + if (force_compaction) decision = true; + + SimpleLogger::Levels log_lv = (decision) + ? SimpleLogger::INFO + : SimpleLogger::DEBUG; + _log_(log_lv, myLog, + "table lv %zu num %zu hash %zu: " + "file size %zu (%s), block reuse cycle %zu, active data %zu (%s), " + "sup file size %zu (%s), sup active data %zu (%s), " + "(ratio %zu, s_ratio %zu vs. factor %zu), " + "decision: %s", + target_table->level, target_table->number, target_table->hashNum, + t_stats.totalSizeByte, + Formatter::sizeToString(t_stats.totalSizeByte, 2).c_str(), + t_stats.blockReuseCycle, + t_stats.workingSetSizeByte, + Formatter::sizeToString(t_stats.workingSetSizeByte, 2).c_str(), + s_stats.totalSizeByte, + Formatter::sizeToString(s_stats.totalSizeByte, 2).c_str(), + s_stats.workingSetSizeByte, + Formatter::sizeToString(s_stats.workingSetSizeByte, 2).c_str(), + ratio, + (s_ratio == (uint64_t)0xffffffff) ? 0 : s_ratio, + db_config->compactionFactor, + (decision ? 
"COMPACT" : "x") ); + + for (TableInfo*& table: tables) table->done(); + return decision; + + } catch (Status s) { + for (TableInfo*& table: tables) table->done(); + return false; + } +} + +Status TableMgr::chkLPCompactCond(size_t level, + TableMgr::MergeStrategy& s_out, + TableInfo*& victim_table_out) +{ + const DBConfig* db_config = getDbConfig(); + Status s; + + // Read-only mode. + if ( db_config->readOnly ) return Status::INVALID_MODE; + + // Compaction is disabled. + if ( !allowCompaction ) return Status::COMPACTION_IS_NOT_ALLOWED; + + // Log section mode. + if ( db_config->logSectionOnly ) return Status::INVALID_MODE; + + // We can't do L1 compaction when L0 compaction is in progress. + if ( level == 1 && isL0CompactionInProg() ) { + return Status::OPERATION_IN_PROGRESS; + } + + uint64_t wss = 0, + total = 0, + max_stack = 0; + + bool force_interlevel = false; + // TODO: If we want to force inter-level compaction + // for intermediate levels, need to add some + // logic to make this flag true here. + (void)force_interlevel; + + // Check if inter-level merge is needed. + if ( !isLevelLocked(level) ) { + s = getLevelSize(level, wss, total, max_stack); + if (!s) return s; + + // FIXME: + // Only one inter-level compaction is allowed at a time now. + // If not, data loss will happen as different tables will + // have empty key as their min keys, at the same time. + uint64_t level_limit = getLevelSizeLimit(level); + if (wss > level_limit) { + _log_info(myLog, "[INTERLEVEL] level %zu wss %zu limit %zu", + level, wss, level_limit); + s_out = TableMgr::INTERLEVEL; + victim_table_out = nullptr; + return Status(); + } + } + + // Find table to split first (when WSS > 1.5x table limit). + TableMgr::VictimPolicy v_policy = TableMgr::WORKING_SET_SIZE_SPLIT; + TableInfo* victim_table = nullptr; + wss = total = 0; + s = pickVictimTable( level, v_policy, true, + victim_table, wss, total ); + if (s && victim_table) { + s_out = force_interlevel ? 
TableMgr::INTERLEVEL : TableMgr::SPLIT; + victim_table_out = victim_table; + return s; + } + + // Not found, then find table to (in-place) compact + // (when TOTAL > WSS * C). + v_policy = TableMgr::STALE_RATIO; + victim_table = nullptr; + wss = total = 0; + s = pickVictimTable( level, v_policy, true, + victim_table, wss, total ); + if (s && victim_table) { + s_out = force_interlevel ? TableMgr::INTERLEVEL : TableMgr::INPLACE; + victim_table_out = victim_table; + return s; + } + + // At lowest priority, find table to merge. + v_policy = TableMgr::SMALL_WORKING_SET; + victim_table = nullptr; + wss = total = 0; + s = pickVictimTable( level, v_policy, true, + victim_table, wss, total ); + if (s && victim_table) { + s_out = TableMgr::MERGE; + victim_table_out = victim_table; + return s; + } + + return Status::TABLE_NOT_FOUND; +} + +} + diff --git a/src/table_compaction.cc b/src/table_compaction.cc new file mode 100644 index 0000000..f67d068 --- /dev/null +++ b/src/table_compaction.cc @@ -0,0 +1,754 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "table_mgr.h" + +#include "db_internal.h" + +namespace jungle { + +// Logically do the same thing as `compactLevel()`, +// but does not keep records in memory, and instead +// use iterator directly. +// +// Trade-off: less memory, but needs two-phase scan. 
+// +Status TableMgr::compactLevelItr(const CompactOptions& options, + TableInfo* victim_table, + size_t level) +{ + if (level >= mani->getNumLevels()) return Status::INVALID_LEVEL; + + Status s; + Timer tt; + + const DBConfig* db_config = getDbConfig(); + DBMgr* mgr = DBMgr::getWithoutInit(); + DebugParams d_params = mgr->getDebugParams(); + + bool is_last_level = (level + 1 == mani->getNumLevels()); + TableInfo* local_victim = findLocalVictim( level, victim_table, + WORKING_SET_SIZE, false ); + if (!local_victim) return Status::TABLE_NOT_FOUND; + + _log_info( myLog, "interlevel compaction %zu -> %zu, victim %zu_%zu, " + "min key %s", + level, level+1, opt.prefixNum, local_victim->number, + local_victim->minKey.toReadableString().c_str() ); + + // List of overlapping tables (before compaction) in the next level. + std::list tables; + + // List of tables to write in the next level. + // If we need to create a new table, `TableInfo` of that + // element will be `nullptr`. + // `minKey` of every element occupies newly allocated memory region, + // so that should be freed at the end of this function. + std::vector new_tables; + TableFile::Iterator* itr = nullptr; + + try { + // Lock level (only one interlevel compaction is allowed at a time). + LevelLockHolder lv_holder(this, level); + if (!lv_holder.ownsLock()) throw Status(Status::OPERATION_IN_PROGRESS); + + // Lock the victim table. + // The other tables will be locked below (in `setBatchLsm()`). + TableLockHolder tl_src_holder(this, {local_victim->number}); + if (!tl_src_holder.ownsLock()) throw Status(Status::OPERATION_IN_PROGRESS); + + // Get overlapping tables. 
+ SizedBuf min_key_victim = local_victim->minKey; + SizedBuf max_key_victim; + SizedBuf::Holder h_max_key_victim(max_key_victim); + local_victim->file->getMaxKey(max_key_victim); + mani->getTablesRange(level+1, min_key_victim, max_key_victim, tables); + _log_info( myLog, "victim table's min key %s max key %s, " + "%zu overlapping table in level %zu", + min_key_victim.toReadableString().c_str(), + max_key_victim.toReadableString().c_str(), + tables.size(), level + 1 ); + for (auto& entry: tables) { + TableInfo* tt = entry; + _log_info( myLog, "table %zu %s at level %zu", + tt->number, + tt->minKey.toReadableString().c_str(), + level + 1 ); + } + + // Lock destination tables. + TableLockHolder tl_dst_holder( this, table_list_to_number(tables) ); + if (!tl_dst_holder.ownsLock()) throw Status(Status::OPERATION_IN_PROGRESS); + + TableStats victim_stats; + local_victim->file->getStats(victim_stats); + + auto t_itr = tables.begin(); + auto t_itr_end = tables.rbegin(); + + TableInfo* min_table = (t_itr != tables.end()) ? *t_itr : nullptr; + TableInfo* max_table = (t_itr_end != tables.rend()) ? *t_itr_end : nullptr; + TableInfo* next_table = nullptr; + RecGroupItr* cur_group = nullptr; + SizedBuf max_key_table; + SizedBuf::Holder h_max_key_table(max_key_table); + if (max_table) { + max_table->file->getMaxKey(max_key_table); + _log_info( myLog, "max key among tables %s", + max_key_table.toReadableString().c_str() ); + } + + uint64_t acc_size = 0; + if ( !min_table || min_key_victim < min_table->minKey ) { + // Should create a new table for min key. + cur_group = new RecGroupItr(SizedBuf(), 0, nullptr); + new_tables.push_back(cur_group); + if (min_table) next_table = min_table; + + } else { + // `min_table` should be the first table. 
+ cur_group = new RecGroupItr(min_table->minKey, 0, min_table); + new_tables.push_back(cur_group); + while (t_itr != tables.end()) { + TableInfo* tt = *t_itr; + if (tt->minKey <= min_key_victim) { + cur_group->table = tt; + } else { + break; + } + t_itr++; + } + next_table = (t_itr == tables.end()) ? nullptr : *t_itr; + + if (cur_group->table) { + TableStats stats_out; + cur_group->table->file->getStats(stats_out); + acc_size = stats_out.workingSetSizeByte; + _log_info( myLog, "acc size of table %zu: %zu", + cur_group->table->number, + acc_size ); + } + } + + // Read all records from the victim table. + uint64_t num_records_read = 0; + uint64_t TABLE_LIMIT = db_config->getMaxTableSize(level + 1); + size_t min_num_tables_for_new_level = + std::max((size_t)4, db_config->getMaxParallelWriters()); + if ( max_key_table.empty() && + db_config->nextLevelExtension && + min_num_tables_for_new_level ) { + // It means that this is the flush flush to L1. + // Ignore given table size, and divide evenly + // (basic assumption is that L0 is hash-partitioned so that it + // won't harm the key distribution of L1). + uint64_t tmp_backup = TABLE_LIMIT; + TABLE_LIMIT = victim_stats.workingSetSizeByte / min_num_tables_for_new_level; + if (!TABLE_LIMIT) TABLE_LIMIT = tmp_backup; + _log_info(myLog, "table limit is adjusted to %zu, " + "num tables %zu, victim WSS %zu", + TABLE_LIMIT, + min_num_tables_for_new_level, + victim_stats.workingSetSizeByte); + } + + SizedBuf empty_key; + itr = new TableFile::Iterator(); + TC( itr->init(nullptr, local_victim->file, empty_key, empty_key) ); + + std::vector offsets; + // Reserve 10% more headroom, just in case. + offsets.reserve(victim_stats.approxDocCount * 11 / 10); + + // Initial scan to get + // 1) number of files after split, and + // 2) min keys for each new file. 
+ do { + if (!isCompactionAllowed()) { + throw Status(Status::COMPACTION_CANCELLED); + } + + Record rec_out; + Record::Holder h_rec_out(rec_out); + size_t value_size_out = 0; + uint64_t offset_out = 0; + s = itr->getMeta(rec_out, value_size_out, offset_out); + if (!s) break; + + offsets.push_back(offset_out); + uint64_t cur_index = offsets.size() - 1; + + if ( next_table && + rec_out.kv.key >= next_table->minKey ) { + // New table. + cur_group = new RecGroupItr( next_table->minKey, + cur_index, + next_table ); + new_tables.push_back(cur_group); + acc_size = 0; + + // Skip tables whose range is not overlapping. + while (t_itr != tables.end()) { + TableInfo* tt = *t_itr; + if (tt->minKey <= rec_out.kv.key) { + cur_group->table = tt; + } else { + break; + } + t_itr++; + } + next_table = (t_itr == tables.end()) ? nullptr : *t_itr; + + if (cur_group->table) { + TableStats stats_out; + cur_group->table->file->getStats(stats_out); + acc_size = stats_out.workingSetSizeByte; + _log_info( myLog, "acc size of table %zu: %zu", + cur_group->table->number, + acc_size ); + } + + if (next_table) { + _log_info( myLog, "next table changed to %zu, %s, cur rec %s", + next_table->number, + next_table->minKey.toReadableString().c_str(), + rec_out.kv.key.toReadableString().c_str() ); + } else { + _log_info( myLog, "next table changed to NULL, cur rec %s", + rec_out.kv.key.toReadableString().c_str() ); + } + + } else if ( acc_size > TABLE_LIMIT ) { + // If not, but accumulated size exceeds the limit, + // move to a new table that is not in `tables`. + // + // BUT, ONLY WHEN THE KEY IS BIGGER THAN THE LAST TABLE'S MAX KEY. 
+ if (rec_out.kv.key > max_key_table) { + _log_info( myLog, "rec key %s is greater than max table key %s, " + "urgent split at level %zu, acc size %zu, limit %zu", + rec_out.kv.key.toReadableString().c_str(), + max_key_table.toReadableString().c_str(), + level + 1, + acc_size, + TABLE_LIMIT ); + cur_group = new RecGroupItr( rec_out.kv.key, + cur_index, + nullptr); + new_tables.push_back(cur_group); + acc_size = 0; + } + // We SHOULD NOT move table cursor (`t_itr`) here. + } + + acc_size += (rec_out.size() + value_size_out + APPROX_META_SIZE); + num_records_read++; + + if (d_params.compactionDelayUs) { + // If debug parameter is given, sleep here. + Timer::sleepUs(d_params.compactionDelayUs); + } + + } while (itr->next().ok()); + itr->close(); + DELETE(itr); + + uint64_t elapsed_us = std::max( tt.getUs(), (uint64_t)1 ); + double scan_rate = (double)num_records_read * 1000000 / elapsed_us; + _log_info(myLog, "reading table %zu_%zu done %zu us, " + "tables before %zu after %zu, " + "%zu records, %.1f iops", + opt.prefixNum, local_victim->number, elapsed_us, + tables.size(), new_tables.size(), + num_records_read, scan_rate); + + size_t num_new_tables = new_tables.size(); + size_t max_writers = getDbConfig()->getMaxParallelWriters(); + std::list dummy_chk; + std::list results; + + if (num_new_tables && num_records_read) { + // Extend new level if needed. + if (is_last_level) mani->extendLevel(); + } + + for (size_t ii = 0; ii < num_new_tables; ) { + size_t upto_orig = std::min(ii + max_writers, num_new_tables); + + // NOTE: request `req_writers - 1`, as the other one is this thread. + size_t req_writers = upto_orig - ii; + TableWriterHolder twh(mgr->tableWriterMgr(), req_writers - 1); + + // Lease may not succeed, adjust `upto`. 
+ size_t leased_writers = twh.leasedWriters.size(); + size_t upto = ii + leased_writers + 1; + + for (size_t jj = ii; jj < upto; jj++) { + size_t worker_idx = jj - ii; + bool leased_thread = (jj + 1 < upto); + + // Create a new file if necessary. + TableFile* cur_file = nullptr; + if (!new_tables[jj]->table) { + TableFileOptions t_opt; + s = createNewTableFile(level + 1, cur_file, t_opt); + if (!s) continue; + } else { + cur_file = new_tables[jj]->table->file; + } + + TableWriterArgs local_args; + local_args.myLog = myLog; + + TableWriterArgs* w_args = (leased_thread) + ? &twh.leasedWriters[worker_idx]->writerArgs + : &local_args; + w_args->callerAwaiter.reset(); + + uint64_t count = (jj + 1 == num_new_tables) + ? offsets.size() - new_tables[jj]->index + : new_tables[jj+1]->index - new_tables[jj]->index; + w_args->payload = TableWritePayload( this, + &offsets, + new_tables[jj]->index, + count, + &dummy_chk, + local_victim->file, + cur_file ); + if (!new_tables[jj]->table) { + // Newly created table, put into the list. + putLsmFlushResultWithKey( cur_file, + new_tables[jj]->minKey, + results ); + } + + if (leased_thread) { + // Leased threads. + w_args->invoke(); + } else { + // This thread. + TableWriterMgr::doTableWrite(w_args); + } + } + + // Wait for workers. + for (size_t jj = ii; jj < upto - 1; jj++) { + size_t worker_idx = jj - ii; + TableWriterArgs* w_args = &twh.leasedWriters[worker_idx]->writerArgs; + while ( !w_args->payload.isEmpty() ) { + w_args->callerAwaiter.wait_ms(1000); + w_args->callerAwaiter.reset(); + } + } + + if (!isCompactionAllowed()) { + // NOTE: keys will be freed below. + for (LsmFlushResult& rr: results) delete rr.tFile; + throw Status(Status::COMPACTION_CANCELLED); + } + + ii += leased_writers + 1; + } + + { // Grab lock, add first, and then remove next. + std::lock_guard l(mani->getLock()); + // WARNING: + // We should add in descending order of min key. 
+ // Otherwise, if there is a point query in the middle, + // it may go to wrong (newly created) table which + // causes false "key not found". + results.sort( LsmFlushResult::cmp ); + for (LsmFlushResult& rr: results) { + TC( mani->addTableFile( level + 1, 0, rr.minKey, rr.tFile ) ); + rr.tFile = nullptr; + } + mani->removeTableFile(level, local_victim); + } + + // WARNING: + // Release it ONLY WHEN this table is not given by caller. + // If not, caller is responsible to release the table. + if (!victim_table) local_victim->done(); + + mani->store(); + mani->sync(); + + // Update throttling rate. + elapsed_us = std::max( tt.getUs(), (uint64_t)1 ); + double write_rate = (double)num_records_read * 1000000 / elapsed_us; + if (parentDb && level == 0) { + parentDb->p->tStats.lastTableFlushRate = write_rate; + parentDb->p->tStats.lastTableFlushRateExpiry.reset(); + } + _log_info( myLog, "L%zu write done: compaction %zu -> %zu, " + "%zu records, %zu target tables, %zu us, %.1f iops", + level+1, level, level+1, num_records_read, + num_new_tables, elapsed_us, write_rate ); + + for (TableInfo*& entry: tables) entry->done(); + for (RecGroupItr*& entry: new_tables) delete entry; + return Status(); + + } catch (Status s) { // ------------------------------------------------ + _log_err(myLog, "compaction failed: %d", (int)s); + + if (itr) { + itr->close(); + DELETE(itr); + } + + // WARNING: Ditto. 
+ if (!victim_table) local_victim->done(); + + for (TableInfo*& entry: tables) entry->done(); + for (RecGroupItr*& entry: new_tables) delete entry; + return s; + } +} + +Status TableMgr::compactInPlace(const CompactOptions& options, + TableInfo* victim_table, + size_t level) +{ + if (level >= mani->getNumLevels()) return Status::INVALID_LEVEL; + + Status s; + Timer tt; + TableInfo* local_victim = findLocalVictim( level, victim_table, + STALE_RATIO, false ); + if (!local_victim) return Status::TABLE_NOT_FOUND; + + _log_info( myLog, "in-place compaction at level %zu victim %zu_%zu, " + "min key %s", + level, opt.prefixNum, local_victim->number, + local_victim->minKey.toReadableString().c_str() ); + + try { + if (level == 1) numL1Compactions.fetch_add(1); + + // Lock victim table. + TableLockHolder tl_holder(this, {local_victim->number}); + if (!tl_holder.ownsLock()) throw Status(Status::OPERATION_IN_PROGRESS); + + uint64_t dst_table_num = 0; + std::string dst_filename; + EP( mani->issueTableNumber(dst_table_num) ); + dst_filename = TableFile::getTableFileName(opt.path, opt.prefixNum, dst_table_num); + + if (opt.fOps->exist(dst_filename)) { + // Previous file exists, which means that there is a legacy log file. + // We should overwrite it. + _log_warn( myLog, "table %s already exists, remove it", + dst_filename.c_str() ); + opt.fOps->remove(dst_filename); + } + + TC( local_victim->file->compactTo(dst_filename, options) ); + + // Open newly compacted file, and add it to manifest. + TableFile* newly_compacted_file = new TableFile(this); + newly_compacted_file->setLogger(myLog); + newly_compacted_file->load( level, dst_table_num, dst_filename, + opt.fOps, TableFileOptions() ); + mani->addTableFile(level, 0, local_victim->minKey, newly_compacted_file); + + // Add special checkpoint 0, for base snapshot. 
+ uint64_t new_file_last_seqnum = 0; + newly_compacted_file->getLatestSnapMarker(new_file_last_seqnum); + newly_compacted_file->addCheckpoint(0, new_file_last_seqnum); + + { // Remove source table file. + std::lock_guard l(mani->getLock()); + mani->removeTableFile(level, local_victim); + } + + // WARNING: + // Release it ONLY WHEN this table is not given by caller. + // If not, caller is responsible to release the table. + if (!victim_table) local_victim->done(); + + mani->store(); + mani->sync(); + _log_info(myLog, "in-place compaction at level %zu done, %zu us", + level, tt.getUs()); + + if (level == 1) numL1Compactions.fetch_sub(1); + return Status::OK; + + } catch (Status s) { // ------------------------------------ + + // WARNING: Ditto. + if (!victim_table) local_victim->done(); + + if (level == 1) numL1Compactions.fetch_sub(1); + return s; + } +} + +Status TableMgr::compactL0(const CompactOptions& options, + uint32_t hash_num) +{ + if (!allowCompaction) { + _log_warn(myLog, "compaction is now allowed"); + return Status::COMPACTION_IS_NOT_ALLOWED; + } + + Status s; + const DBConfig* db_config = getDbConfig(); + + std::list tables; + s = mani->getL0Tables(hash_num, tables); + if (!s) { + _log_warn(myLog, "tables of hash %zu not found", hash_num); + return s; + } + + bool exp = false; + bool val = true; + if (!compactStatus[hash_num]->compare_exchange_strong(exp, val)) { + // Compaction is already in progress. + _log_warn(myLog, "compaction hash %zu is in progress", hash_num); + return Status::OPERATION_IN_PROGRESS; + } + + std::stringstream s_log_msg_prefix; + DBMgr* db_mgr = DBMgr::getWithoutInit(); + SimpleLogger* global_log = db_mgr->getLogger(); + + try { + // NOTE: + // Case 1) only one file exists: e.g.) table_000. + // - Create a new table: table_001, it will serve new writes. + // - Compact from table_000 to table_002. + // Case 2) two files exist: e.g.) table_001 and table_002. + // - Compact from table_001 to table_003. 
+ // - table_002 will serve new writes. + + // WARNING: Should compact src file first if exists. + TableInfo* origin_table = nullptr; + TableInfo* merge_table = nullptr; + getTwoSmallSrcTables(tables, hash_num, origin_table, merge_table); + if (!origin_table) { + getTwoSmallNormalTables(tables, hash_num, origin_table, merge_table); + if (!origin_table) throw Status(Status::TABLE_NOT_FOUND); + } + + TableStats origin_stat; + origin_table->file->getStats(origin_stat); + uint64_t target_filesize = origin_stat.workingSetSizeByte; + target_filesize *= db_config->blockReuseFactor; + target_filesize /= 100; + + uint64_t min_block_reuse_size = (uint64_t)64 * 1024 * 1024; + if (!db_config->nextLevelExtension) { + min_block_reuse_size = std::max(min_block_reuse_size, target_filesize); + } + _log_info(myLog, "target block reuse size %zu", min_block_reuse_size); + + // Create a new table file to serve writes from now on. + // + // WARNING: Only when the origin table is in NORMAL status. + uint64_t new_normal_table_num = 0; + TableFileOptions t_opt; + if (origin_table->isNormal()) { + if (db_config->nextLevelExtension && merge_table) { + // Mode change happened. `merge_table` is older one, so that + // we should compact it, and keep `origin_table`. + new_normal_table_num = origin_table->number; + origin_table = merge_table; + merge_table = nullptr; + _log_info( myLog, "detected legacy file format, " + "keep new normal table %zu and " + "compact the old table %zu to L1", + new_normal_table_num, + origin_table->number ); + + } else { + TableFile* new_normal_file = nullptr; + t_opt.minBlockReuseFileSize = min_block_reuse_size; + if ( db_config->bloomFilterBitsPerUnit > 0.0 ) { + // New table's bloom filter size is based on the current WSS. 
+ t_opt.bloomFilterSize = + TableFile::getBfSizeByWss(db_config, + origin_stat.workingSetSizeByte); + if (!t_opt.bloomFilterSize) { + t_opt.bloomFilterSize = + TableFile::getBfSizeByLevel(db_config, 0); + } + } + createNewTableFile(0, new_normal_file, t_opt); + EP( mani->addTableFile(0, hash_num, SizedBuf(), new_normal_file) ); + new_normal_table_num = new_normal_file->getNumber(); + } + + } else { + TableInfo* existing_normal_info = + getSmallestNormalTable(tables, hash_num); + new_normal_table_num = existing_normal_info->number; + } + + uint64_t dst_table_num = 0; + std::string dst_filename; + { // NOTE: + // To avoid the situation that new updates are written to + // the old file (i.e., compaction source), both setBatch() + // and compactL0() should be mutual exclusive. + std::unique_lock l(L0Lock); + + if (db_config->nextLevelExtension && !merge_table) { + // Level-extension mode. + s_log_msg_prefix + << "hash " << hash_num << ", " + << "table " << opt.prefixNum << "_" << origin_table->number + << " -> next-level LSM, " + << "new normal table " << opt.prefixNum << "_" + << new_normal_table_num; + + } else { + // L0-only mode + // (or level-extension mode but cancelled legacy compaction). + + // Get DST file name. 
+ TC( mani->issueTableNumber(dst_table_num) ); + dst_filename = TableFile::getTableFileName + ( opt.path, opt.prefixNum, dst_table_num ); + + if (merge_table) { + s_log_msg_prefix + << "hash " << hash_num << ", " + << "table " << opt.prefixNum << "_" << origin_table->number + << " + " << opt.prefixNum << "_" << merge_table->number + << " -> " << opt.prefixNum << "_" << dst_table_num << ", " + << "new normal table " << opt.prefixNum << "_" + << new_normal_table_num; + } else { + s_log_msg_prefix + << "hash " << hash_num << ", " + << "table " << opt.prefixNum << "_" << origin_table->number + << " -> " << opt.prefixNum << "_" << dst_table_num << ", " + << "new normal table " << opt.prefixNum << "_" + << new_normal_table_num; + } + } + _log_info( myLog, "[COMPACTION BEGIN] %s", + s_log_msg_prefix.str().c_str() ); + _log_info( global_log, "[COMPACTION BEGIN] %s", + s_log_msg_prefix.str().c_str() ); + + // Before executing compaction, change the status. + // Now all incoming writes will go to the next file. + origin_table->setCompactSrc(); + if (merge_table) merge_table->setCompactSrc(); + } + + // Store manifest file. + mani->store(); + mani->sync(); + + if ( !dst_filename.empty() && + opt.fOps->exist(dst_filename) ) { + // Previous file exists, which means that there is a legacy log file. + // We should overwrite it. + _log_warn( myLog, "table %s already exists, remove it", + dst_filename.c_str() ); + opt.fOps->remove(dst_filename); + } + + Timer tt; + + // Do compaction. + if (merge_table) { + // Merge table exists: do merge compaction. + s = origin_table->file->mergeCompactTo(merge_table->file->getName(), + dst_filename, + options); + if (!s) throw s; + + } else { + // Otherwise: single compaction + if (db_config->nextLevelExtension) { + // 1) Level-extension mode: merge (append) to the next level. + s = compactLevelItr(options, origin_table, 0); + + } else { + // 2) L0-only mode: only happens at the beginning. 
+ s = origin_table->file->compactTo(dst_filename, options); + } + if (!s) throw s; + } + + _log_info( myLog, "[COMPACTION END] %s, %zu us elapsed", + s_log_msg_prefix.str().c_str(), tt.getUs() ); + _log_info( global_log, "[COMPACTION END] %s, %zu us elapsed", + s_log_msg_prefix.str().c_str(), tt.getUs() ); + + if (!db_config->nextLevelExtension || merge_table) { + // L0-only mode + // (or extension mode but merge table exists due to legacy file format). + // Open newly compacted file, and add it to manifest. + TableFile* newly_compacted_file = new TableFile(this); + newly_compacted_file->setLogger(myLog); + newly_compacted_file->load(0, dst_table_num, dst_filename, opt.fOps, t_opt); + mani->addTableFile(0, hash_num, SizedBuf(), newly_compacted_file); + + // Add special checkpoint 0, for base snapshot. + uint64_t new_file_last_seqnum = 0; + newly_compacted_file->getLatestSnapMarker(new_file_last_seqnum); + newly_compacted_file->addCheckpoint(0, new_file_last_seqnum); + + // Remove old file from manifest, and delete it. + mani->removeTableFile(0, origin_table); + if (merge_table) mani->removeTableFile(0, merge_table); + + for (TableInfo*& table: tables) table->done(); + + // Store manifest file. + mani->store(); + mani->sync(); + + } else { + // Level-extension mode: + // `compactLevel` already removed & synced files. + // Just release. 
+ for (TableInfo*& table: tables) table->done(); + } + + compactStatus[hash_num]->store(false); + return Status(); + + } catch (Status s) { // ======== + + if (s == Status::COMPACTION_CANCELLED) { + _log_warn( myLog, "[COMPACTION CANCELLED] %s", + s_log_msg_prefix.str().c_str() ); + _log_warn( global_log, "[COMPACTION CANCELLED] %s", + s_log_msg_prefix.str().c_str() ); + } else { + _log_warn( myLog, "[COMPACTION ERROR]: %s, %d", + s_log_msg_prefix.str().c_str(), s ); + _log_warn( global_log, "[COMPACTION ERROR]: %s, %d", + s_log_msg_prefix.str().c_str(), s ); + } + mani->store(); + mani->sync(); + + for (TableInfo*& table: tables) table->done(); + compactStatus[hash_num]->store(false); + return s; + } +} + +}; // namespace jungle + diff --git a/src/table_file.cc b/src/table_file.cc new file mode 100644 index 0000000..b3eb7f1 --- /dev/null +++ b/src/table_file.cc @@ -0,0 +1,1263 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "table_file.h" + +#include "bloomfilter.h" +#include "db_mgr.h" +#include "internal_helper.h" +#include "table_mgr.h" + +#include _MACRO_TO_STR(LOGGER_H) + +namespace jungle { + +static inline fdb_compact_decision fdb_cb_bridge + ( fdb_file_handle *fhandle, + fdb_compaction_status status, + const char *kv_store_name, + fdb_doc *doc, + uint64_t last_oldfile_offset, + uint64_t last_newfile_offset, + void *ctx) +{ + CompactionCbParams params; + params.rec.kv.key = SizedBuf(doc->keylen, doc->key); + params.rec.kv.value = SizedBuf(doc->bodylen, doc->body); + params.rec.meta = SizedBuf(doc->metalen, doc->meta); + params.rec.seqNum = doc->seqnum; + + const DBConfig* db_config = (const DBConfig*)(ctx); + CompactionCbDecision dec = db_config->compactionCbFunc(params); + if (dec == CompactionCbDecision::DROP) return FDB_CS_DROP_DOC; + + return FDB_CS_KEEP_DOC; +} + +static inline void fdb_log_cb(int level, + int ec, + const char* file, + const char* func, + size_t line, + const char* err_msg, + void* ctx) +{ + SimpleLogger* my_log = (SimpleLogger*)ctx; + my_log->put(level, file, func, line, "[FDB][%d] %s", ec, err_msg); +} + +TableFile::FdbHandle::FdbHandle(TableFile* _parent, + const DBConfig* db_config, + const TableFileOptions& t_file_opt) + : parent(_parent) + , dbConfig(db_config) + , tFileOpt(t_file_opt) + , dbFile(nullptr) + , db(nullptr) + , config(getFdbSettings(db_config)) + , kvsConfig(getKvsSettings()) + {} + +TableFile::FdbHandle::~FdbHandle() { + close(); +} + +fdb_config TableFile::FdbHandle::getFdbSettings(const DBConfig* db_config) { + fdb_config config = fdb_get_default_config(); + + DBMgr* mgr = DBMgr::getWithoutInit(); + if (mgr) { + config.buffercache_size = mgr->getGlobalConfig()->fdbCacheSize; + fdb_set_log_callback_ex_global(fdb_log_cb, + mgr->getLogger()); + } + + if (db_config && db_config->bulkLoading) { + // Bulk loading mode: enable WAL flush before commit. 
+ config.wal_flush_before_commit = true; + config.bulk_load_mode = true; + } else { + // Otherwise: + // Jungle will manually control WAL flushing. + config.wal_flush_before_commit = false; + config.bulk_load_mode = false; + } + config.do_not_search_wal = true; + + // Disable auto compaction, + // temporarily enable block reuse. + config.compaction_threshold = 0; + if ( db_config->blockReuseFactor && + db_config->blockReuseFactor > 100 ) { + size_t F = db_config->blockReuseFactor; + // 300% -> 66.6% stale ratio. + // 333% -> 70% stale ratio. + config.block_reusing_threshold = (F - 100) * 100 / F; + } else { + // Disabled. + config.block_reusing_threshold = 100; + } + config.max_block_reusing_cycle = db_config->maxBlockReuseCycle; + config.min_block_reuse_filesize = tFileOpt.minBlockReuseFileSize; + config.seqtree_opt = FDB_SEQTREE_USE; + config.purging_interval = 60 * 60; // 1 hour. + config.num_keeping_headers = 10; + config.do_not_move_to_compacted_file = true; + //config.enable_reusable_block_reservation = true; + + // If compaction callback function is given, enable it. + if (db_config->compactionCbFunc) { + config.compaction_cb = fdb_cb_bridge; + config.compaction_cb_ctx = (void*)db_config; + // Callback function will be invoked for every document. + config.compaction_cb_mask = FDB_CS_MOVE_DOC; + } + + // We SHOULD have at least one ForestDB background compactor, + // to do lazy file deletion. + // NOTE: + // We can also disable both compactor and lazy deletion, + // but deleting large size file may have bad impact on latency, + // as foreground deletion usually happens on close of iterator. 
+ config.enable_background_compactor = true; + config.num_compactor_threads = 1; + + config.log_msg_level = 4; + return config; +} + +fdb_kvs_config TableFile::FdbHandle::getKvsSettings() { + return fdb_get_default_kvs_config(); +} + +void TableFile::FdbHandle::refreshSettings() { + config = getFdbSettings(dbConfig); + kvsConfig = getKvsSettings(); +} + +Status TableFile::FdbHandle::open(const std::string& filename) { + fdb_status fs; + + fs = fdb_open(&dbFile, filename.c_str(), &config); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_FILE_FAIL; + + fs = fdb_kvs_open(dbFile, &db, NULL, &kvsConfig); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_KVS_FAIL; + + return Status(); +} + +Status TableFile::FdbHandle::openCustomCmp(const std::string& filename, + fdb_custom_cmp_variable cmp_func, + void* cmp_func_param) +{ + fdb_status fs; + + char* kvs_names[1] = {nullptr}; + fdb_custom_cmp_variable functions[1] = {cmp_func}; + void* user_params[1] = {cmp_func_param}; + fs = fdb_open_custom_cmp(&dbFile, filename.c_str(), &config, + 1, kvs_names, functions, user_params); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_FILE_FAIL; + + fs = fdb_kvs_open(dbFile, &db, NULL, &kvsConfig); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_KVS_FAIL; + + return Status(); +} + +Status TableFile::FdbHandle::commit() { + fdb_status fs; + fs = fdb_commit(dbFile, FDB_COMMIT_MANUAL_WAL_FLUSH); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_COMMIT_FAIL; + return Status(); +} + +Status TableFile::FdbHandle::close() { + fdb_status fs = FDB_RESULT_SUCCESS; + if (db) { + fs = fdb_kvs_close(db); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_KVS_CLOSE_FAIL; + db = nullptr; + } + if (dbFile) { + fdb_close(dbFile); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_CLOSE_FAIL; + dbFile = nullptr; + } + return Status(); +} + +TableFile::FdbHandleGuard::FdbHandleGuard( TableFile* _t_file, + FdbHandle* _handle ) + : tFile(_t_file), handle(_handle) + {} + 
+TableFile::FdbHandleGuard::~FdbHandleGuard() { + if (handle) tFile->returnHandle(handle); +} + + +TableFile::TableFile(const TableMgr* table_mgr) + : myNumber(NOT_INITIALIZED) + , fOps(nullptr) + , tableMgr(table_mgr) + , tableInfo(nullptr) + , writer(nullptr) + , bfByKey(nullptr) + , tlbByKey(nullptr) + , myLog(nullptr) +{} + +TableFile::~TableFile() { + assert(snapHandles.size() == 0); + { std::lock_guard l(latestSnapshotLock); + for (Snapshot*& cur_snp: latestSnapshot) { + // Remaining snapshot's reference counter should be 1, + // which is referred to by this file. + // Note that all iterators derived from this file + // should be closed before calling this destructor. + assert(cur_snp->refCount == 1); + fdb_kvs_close(cur_snp->fdbSnap); + delete cur_snp; + } + } + if (writer) { + DELETE(writer); + } + for (auto& entry: readers) { + delete entry; + } + DELETE(bfByKey); + DELETE(tlbByKey); +} + +std::string TableFile::getTableFileName(const std::string& path, + uint64_t prefix_num, + uint64_t table_file_num) +{ + // Table file name example: table0001_00000001 + // table0001_00000002 + // ... 
+ char p_num[16]; + char t_num[16]; + sprintf(p_num, "%04" PRIu64, prefix_num); + sprintf(t_num, "%08" PRIu64, table_file_num); + std::string t_filename = path + "/table" + p_num + "_" + t_num; + return t_filename; +} + +TableFile::FdbHandle* TableFile::getIdleHandle() { + mGuard l(readersLock); + FdbHandle* ret = nullptr; + auto entry = readers.begin(); + if (entry == readers.end()) { + l.unlock(); + + ret = new FdbHandle(this, tableMgr->getDbConfig(), myOpt); + openFdbHandle(tableMgr->getDbConfig(), filename, ret); + + l.lock(); + } else { + ret = *entry; + readers.pop_front(); + } + return ret; +} + +void TableFile::returnHandle(FdbHandle* f_handle) { + mGuard l(readersLock); + readers.push_front(f_handle); +} + +Status TableFile::openFdbHandle(const DBConfig* db_config, + const std::string& f_name, + FdbHandle* f_handle) +{ + Status s; + if (db_config->cmpFunc) { + // Custom cmp mode. + EP( f_handle->openCustomCmp( f_name, + db_config->cmpFunc, + db_config->cmpFuncParam ) ); + } else { + EP( f_handle->open(f_name) ); + } + return Status::OK; +} + +uint64_t TableFile::getBfSizeByLevel(const DBConfig* db_config, size_t level) { + uint64_t MAX_TABLE_SIZE = db_config->getMaxTableSize(level); + uint64_t bf_bitmap_size = MAX_TABLE_SIZE / 1024 * + db_config->bloomFilterBitsPerUnit; + return bf_bitmap_size; +} + +uint64_t TableFile::getBfSizeByWss(const DBConfig* db_config, uint64_t wss) { + uint64_t bf_bitmap_size = wss / 1024 * + db_config->bloomFilterBitsPerUnit; + return bf_bitmap_size; +} + +uint64_t TableFile::getBfSize() const { + if (!bfByKey) return 0; + return bfByKey->size(); +} + +void TableFile::initBooster(size_t level, const DBConfig* db_config) { + uint64_t limit = tableMgr->getBoosterLimit(level); + if (!limit) return; + tlbByKey = new TableLookupBooster( limit, tableMgr, this ); +} + +Status TableFile::create(size_t level, + uint64_t table_number, + const std::string& f_name, + FileOps* f_ops, + const TableFileOptions& opt) +{ + if (writer) return 
Status::ALREADY_INITIALIZED; + + Status s; + filename = f_name; + myNumber = table_number; + fOps = f_ops; + myOpt = opt; + + if (fOps->exist(filename)) { + // Previous file exists, which means that there is a legacy log file. + // We should overwrite it. + _log_warn(myLog, "table %s already exists, remove it", filename.c_str()); + fOps->remove(filename); + } + + const DBConfig* db_config = tableMgr->getDbConfig(); + + // Create a ForestDB file. + writer = new FdbHandle(this, tableMgr->getDbConfig(), myOpt); + EP( openFdbHandle(db_config, filename, writer) ); + + // Bloom filter (LSM mode only). + if ( db_config->bloomFilterBitsPerUnit > 0.0 && + !bfByKey ) { + uint64_t bf_bitmap_size = myOpt.bloomFilterSize; + if (!bf_bitmap_size) bf_bitmap_size = getBfSizeByLevel(db_config, level); + bfByKey = new BloomFilter(bf_bitmap_size, 3); + + // Initial save. + saveBloomFilter(filename + ".bf", bfByKey, true); + } + + // Lookup booster. + initBooster(level, db_config); + + // Initial commit. + EP( writer->commit() ); + updateSnapshot(); + + return Status(); +} + +Status TableFile::load(size_t level, + uint64_t table_number, + const std::string& f_name, + FileOps* f_ops, + const TableFileOptions& opt) +{ + if (writer) return Status::ALREADY_INITIALIZED; + if (!f_ops->exist(f_name.c_str())) return Status::FILE_NOT_EXIST; + + Status s; + filename = f_name; + myNumber = table_number; + fOps = f_ops; + myOpt = opt; + + const DBConfig* db_config = tableMgr->getDbConfig(); + + // Handle for writer. + writer = new FdbHandle(this, tableMgr->getDbConfig(), myOpt); + EP( openFdbHandle(db_config, filename, writer) ); + + // Bloom filter (LSM mode only). + if ( db_config->bloomFilterBitsPerUnit > 0.0 && + !bfByKey ) { + std::string bf_filename = filename + ".bf"; + loadBloomFilter(bf_filename, bfByKey); + } + + // Lookup booster. + initBooster(level, db_config); + + // Pre-load snapshot. 
+ updateSnapshot(); + + return Status(); +} + +Status TableFile::loadBloomFilter(const std::string& filename, + BloomFilter*& bf_out) +{ + // Bloom filter file doesn't exist, just OK. + if (!fOps->exist(filename)) { + bf_out = nullptr; + return Status::OK; + } + + Status s; + FileHandle* b_file = nullptr; + EP( fOps->open(&b_file, filename.c_str()) ); + + try { + size_t file_size = fOps->eof(b_file); + if (!file_size) throw Status(); + + SizedBuf header( sizeof(uint32_t) * 2 ); + SizedBuf::Holder h_header(header); + SizedBuf buf(file_size - header.size); + + TC( fOps->pread(b_file, header.data, header.size, 0) ); + RwSerializer ss(header); + + // << Format >> + // Version 4 bytes + // Length (X) 4 bytes + // Bitmap X bytes + uint32_t ver = ss.getU32(s); + (void)ver; + uint32_t data_size = ss.getU32(); + (void)data_size; + assert(data_size == buf.size); + + TC( fOps->pread(b_file, buf.data, data_size, header.size) ); + bf_out = new BloomFilter(0, 3); + // Memory region of `buf` will be moved to Bloom filter. 
        // Transfer ownership of the bitmap buffer into the bloom filter.
        bf_out->moveBitmapFrom(buf.data, buf.size);

        EP( fOps->close(b_file) );
        DELETE(b_file);
        return Status::OK;

    } catch (Status s) {
        // NOTE(review): errors while reading the bloom filter are swallowed
        // and OK is returned — presumably the filter is treated as optional;
        // confirm that callers tolerate a null/partial `bf_out` here.
        EP( fOps->close(b_file) );
        DELETE(b_file);
        return Status::OK;
    }
}

// Persist the bloom filter `bf` to `filename`:
//   Version   4 bytes
//   Length(X) 4 bytes
//   Bitmap    X bytes
// No-op when filename is empty or the filter is absent/empty.
// `call_fsync` forces the data to disk before returning.
Status TableFile::saveBloomFilter(const std::string& filename,
                                  BloomFilter* bf,
                                  bool call_fsync)
{
    if (filename.empty() || !bf || !bf->size()) return Status::OK;

    Status s;
    FileHandle* b_file = nullptr;
    EP( fOps->open(&b_file, filename.c_str()) );

    try {
        // `size()` is in bits; the bitmap is written as bytes.
        size_t data_size = bf->size() / 8;
        SizedBuf buf( sizeof(uint32_t) * 2);
        SizedBuf::Holder h_buf(buf);

        RwSerializer ss(buf);
        ss.putU32(0);            // Format version (currently 0).
        ss.putU32(data_size);    // Bitmap length in bytes.
        TC( fOps->pwrite(b_file, buf.data, buf.size, 0) );
        TC( fOps->pwrite(b_file, bf->getPtr(), data_size, buf.size) );
        if (call_fsync) fOps->fsync(b_file);

        EP( fOps->close(b_file) );
        DELETE(b_file);
        return Status::OK;

    } catch (Status s) {
        // NOTE(review): write errors are swallowed (returns OK) —
        // best-effort persistence; the filter can be rebuilt on reload.
        EP( fOps->close(b_file) );
        DELETE(b_file);
        return Status::OK;
    }
}

// Apply new table-file options; the writer handle is closed and reopened
// so the new ForestDB settings (e.g., min block reuse size) take effect.
Status TableFile::changeOptions(const TableFileOptions& new_opt) {
    Status s;

    _log_info(myLog, "table %zu_%zu changed minBlockReuseFileSize %zu -> %zu",
              tableMgr->getTableMgrOptions()->prefixNum, myNumber,
              myOpt.minBlockReuseFileSize, new_opt.minBlockReuseFileSize);
    myOpt = new_opt;

    // Close and reopen to apply the new configuration.
    writer->close();
    writer->refreshSettings();
    EP( openFdbHandle(tableMgr->getDbConfig(), filename, writer) );

    return Status();
}

// Open a ForestDB snapshot corresponding to `checkpoint` and register it
// under `snap_handle`. If the exact checkpoint is not in `chkMap`, the
// greatest marker not exceeding it is used (or the latest marker when the
// requested checkpoint is beyond this table's range).
Status TableFile::openSnapshot(DB* snap_handle,
                               const uint64_t checkpoint)
{
    Status s;
    uint64_t snap_seqnum = 0;

    { mGuard l(chkMapLock);
      auto entry = chkMap.find(checkpoint);
      if (entry == chkMap.end()) {
          // Exact match doesn't exist.
          auto e_max = chkMap.rbegin();
          if ( e_max == chkMap.rend() ||
               checkpoint > e_max->second ) {
              // Beyond the table's checkpoint.
              // Take the latest marker.
+ l.unlock(); + getLatestSnapMarker(snap_seqnum); + + } else { + // Find greatest one smaller than chk. + auto entry = chkMap.begin(); + while (entry != chkMap.end()) { + if (entry->first <= checkpoint) { + snap_seqnum = entry->second; + } + entry++; + } + } + } else { + // Exact match exists. + snap_seqnum = entry->second; + } + } + if (!snap_seqnum) return Status::INVALID_CHECKPOINT; + + FdbHandleGuard g(this, getIdleHandle()); + fdb_kvs_handle* kvs_db = g.handle->db; + + fdb_status fs; + fdb_kvs_handle* fdbSnap; + fs = fdb_snapshot_open(kvs_db, &fdbSnap, snap_seqnum); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_KVS_FAIL; + + { mGuard l(snapHandlesLock); + snapHandles.insert( std::make_pair(snap_handle, fdbSnap) ); + } + return Status(); +} + +Status TableFile::closeSnapshot(DB* snap_handle) { + Status s; + fdb_kvs_handle* fdb_snap = nullptr; + { mGuard l(snapHandlesLock); + auto entry = snapHandles.find(snap_handle); + if (entry == snapHandles.end()) return Status::INVALID_SNAPSHOT; + fdb_snap = entry->second; + snapHandles.erase(entry); + } + + fdb_status fs = fdb_kvs_close(fdb_snap); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_CLOSE_FAIL; + return Status(); +} + +void TableFile::addCheckpoint(uint64_t chk, uint64_t commit_seqnum) { + if (tableInfo) { + _log_info(myLog, "file lv %zu num %zu hash %zu checkpoint %zu %zu", + tableInfo->level, tableInfo->number, tableInfo->hashNum, + chk, commit_seqnum); + } + mGuard l(chkMapLock); + chkMap.insert( std::make_pair(chk, commit_seqnum) ); +} + +Status TableFile::setCheckpoint(Record* rec, + uint64_t prev_seqnum, + std::list& checkpoints, + bool remaining_all) +{ + Status s; + for (auto& chk_entry: checkpoints) { + uint64_t chk = chk_entry; + if ( prev_seqnum == chk || + (prev_seqnum < chk && rec && chk < rec->seqNum) || + (prev_seqnum <= chk && remaining_all) ) { + // Commit for the checkpoint. 
+ fdb_seqnum_t commit_seqnum; + fdb_get_kvs_seqnum(writer->db, &commit_seqnum); + EP( writer->commit() ); + + addCheckpoint(chk, commit_seqnum); + } + } + return Status(); +} + +void TableFile::userMetaToRawMeta(const SizedBuf& user_meta, + bool is_tombstone, + SizedBuf& raw_meta_out) +{ + // Add 9 bytes in front: + // identifier (1 byte) + version (4 bytes) + flags (4 bytes). + + // NOTE: Even though `user_meta` is empty, we should put 9 bytes. + if (!raw_meta_out.size) { + raw_meta_out.alloc(user_meta.size + META_ADD_SIZE); + } + RwSerializer rw(raw_meta_out); + + // Put 0x1 as an identifier. + rw.putU8(0x1); + // Version 1. + rw.putU32(0x0); + + // Flags. + uint32_t flags = 0x0; + if (is_tombstone) flags |= 0x1; + rw.putU32(flags); + + // User meta. + rw.put(user_meta.data, user_meta.size); +} + +void TableFile::rawMetaToUserMeta(const SizedBuf& raw_meta, + bool& is_tombstone_out, + SizedBuf& user_meta_out) +{ + if (raw_meta.empty()) return; + + RwSerializer rw(raw_meta); + + // Check identifier. + uint8_t identifier = rw.getU8(); + if (identifier != 0x1) { + // No conversion. + raw_meta.copyTo(user_meta_out); + return; + } + + // Version. + uint32_t version = rw.getU32(); + (void)version; // TODO: version. + + // Flags. + uint32_t flags = rw.getU32(); + if (flags & 0x1) is_tombstone_out = true; + + // User meta. + if (raw_meta.size <= META_ADD_SIZE) { + // Empty user meta. 
+ return; + } + + user_meta_out.alloc(raw_meta.size - META_ADD_SIZE); + rw.get(user_meta_out.data, user_meta_out.size); +} + +Status TableFile::setSingle(uint32_t key_hash_val, + const Record& rec, + uint64_t& offset_out) +{ + fdb_doc doc; + fdb_status fs; + fdb_kvs_handle* kvs_db = writer->db; + + memset(&doc, 0x0, sizeof(doc)); + doc.key = rec.kv.key.data; + doc.keylen = rec.kv.key.size; + + char tmp_buf[512]; + SizedBuf raw_meta_static(rec.meta.size + META_ADD_SIZE, tmp_buf); + + SizedBuf raw_meta_alloc; + SizedBuf::Holder h_raw_meta(raw_meta_alloc); + + if (rec.meta.size < 500) { + userMetaToRawMeta(rec.meta, rec.isDel(), raw_meta_static); + doc.meta = raw_meta_static.data; + doc.metalen = raw_meta_static.size; + + } else { + userMetaToRawMeta(rec.meta, rec.isDel(), raw_meta_alloc); + doc.meta = raw_meta_alloc.data; + doc.metalen = raw_meta_alloc.size; + } + + doc.body = rec.kv.value.data; + doc.bodylen = rec.kv.value.size; + doc.seqnum = rec.seqNum; + doc.flags = FDB_CUSTOM_SEQNUM; + + fs = fdb_set(kvs_db, &doc); + if (fs != FDB_RESULT_SUCCESS) { + return Status::FDB_SET_FAIL; + } + + offset_out = doc.offset; + + if (rec.isIns()) { + // Set bloom filter if exists. + if (bfByKey) { + bfByKey->set(rec.kv.key.data, rec.kv.key.size); + } + } + // Put into booster if exists. + if (tlbByKey) { + TableLookupBooster::Elem ee( key_hash_val, rec.seqNum, offset_out ); + tlbByKey->setIfNew(ee); + } + + return Status(); +} + +Status TableFile::setBatch(std::list& batch, + std::list& checkpoints, + const SizedBuf& min_key, + const SizedBuf& min_key_next_table, + uint32_t target_hash, + bool bulk_load_mode) +{ + Timer tt; + + uint64_t prev_seqnum = 0; + size_t num_l0 = tableMgr->getNumL0Partitions(); + size_t set_count = 0; + size_t del_count = 0; + + for (auto& entry: batch) { + Record* rec = entry; + + // If hash is given, check hash. 
+ uint32_t hash_val = getMurmurHash32(rec->kv.key); + if (target_hash != _SCU32(-1)) { + size_t key_hash = hash_val % num_l0; + if (key_hash != target_hash) continue; + } + + // If range is given, check range: + // [min_key, min_key_next_table) + if ( !min_key.empty() && + rec->kv.key < min_key) continue; + if ( !min_key_next_table.empty() && + rec->kv.key >= min_key_next_table ) continue; + + // Append all checkpoints that + // `record[n-1] seqnum <= chk < record[n] seqnum` + setCheckpoint(rec, prev_seqnum, checkpoints); + + if (rec->isCmd()) continue; + + uint64_t offset_out = 0; + Status s = setSingle(hash_val, *rec, offset_out); + if (!s) return s; + + if (rec->isDel()) del_count++; + else set_count++; + + prev_seqnum = rec->seqNum; + } + + // Set all remaining (record[n] <= chk) checkpoints. + setCheckpoint(nullptr, prev_seqnum, checkpoints, true); + + // Save bloom filter. + // WARNING: Writing bloom filter SHOULD BE DONE BEFORE COMMIT. + Timer tt_bf; + if (bfByKey) saveBloomFilter(filename + ".bf", bfByKey, true); + uint64_t bf_elapsed = tt_bf.getUs(); + + if (!bulk_load_mode) { + // Commit and update index node (not in bulk load mode). + writer->commit(); + + // Pre-load & keep the snapshot of latest table file data. 
+ updateSnapshot(); + } + + SimpleLogger::Levels ll = SimpleLogger::INFO; + if (tableInfo) { + if (tableInfo->level) { + _log_( ll, myLog, + "L%zu: file %zu_%zu, set %zu del %zu, %zu us, %zu us", + tableInfo->level, + tableMgr->getTableMgrOptions()->prefixNum, + myNumber, + set_count, del_count, tt.getUs(), bf_elapsed ); + } else { + _log_( ll, myLog, + "L%zu: hash %zu, file %zu_%zu, set %zu del %zu, %zu us, %zu us", + tableInfo->level, + tableInfo->hashNum, + tableMgr->getTableMgrOptions()->prefixNum, + myNumber, + set_count, del_count, tt.getUs(), bf_elapsed ); + } + } else { + _log_( ll, myLog, + "brand new table: file %zu_%zu, set %zu del %zu, %zu us, %zu us", + tableMgr->getTableMgrOptions()->prefixNum, + myNumber, + set_count, del_count, tt.getUs(), bf_elapsed ); + } + + // Bulk load mode: all done here. + if (bulk_load_mode) return Status(); + + { + // Remove all checkpoints earlier than the oldest seqnum. + uint64_t oldest_seq = 0; + getOldestSnapMarker(oldest_seq); + + mGuard l(chkMapLock); + auto entry = chkMap.begin(); + while (entry != chkMap.end()) { + if ( entry->first < oldest_seq || + entry->second < oldest_seq ) { + if (tableInfo) { + _log_debug( myLog, + "file lv %zu num %zu hash %zu removed " + "checkpoint %zu %zu", + tableInfo->level, tableInfo->number, + tableInfo->hashNum, + entry->first, entry->second ); + } + entry = chkMap.erase(entry); + } else { + entry++; + } + } + } + + return Status(); +} + +Status TableFile::get(DB* snap_handle, + Record& rec_io, + bool meta_only) +{ + const DBConfig* db_config = tableMgr->getDbConfig(); + + // Search bloom filter first if exists. 
+ if ( bfByKey && + db_config->useBloomFilterForGet && + !bfByKey->check(rec_io.kv.key.data, rec_io.kv.key.size) ) { + return Status::KEY_NOT_FOUND; + } + + fdb_status fs; + fdb_doc doc_base; + fdb_doc doc_by_offset; + memset(&doc_base, 0x0, sizeof(doc_base)); + doc_base.key = rec_io.kv.key.data; + doc_base.keylen = rec_io.kv.key.size; + + fdb_doc* doc = &doc_base; + + if (snap_handle) { + // Snapshot (does not use booster). + fdb_kvs_handle* kvs_db = nullptr; + { mGuard l(snapHandlesLock); + auto entry = snapHandles.find(snap_handle); + if (entry == snapHandles.end()) return Status::SNAPSHOT_NOT_FOUND; + kvs_db = entry->second; + } + + if (meta_only) { + fs = fdb_get_metaonly(kvs_db, doc); + } else { + fs = fdb_get(kvs_db, doc); + } + + } else { + // Normal. + FdbHandleGuard g(this, getIdleHandle()); + fdb_kvs_handle* kvs_db = g.handle->db; + + bool skip_normal_search = false; + uint32_t key_hash = getMurmurHash32(rec_io.kv.key); + IF ( !meta_only && tlbByKey ) { + // Search booster if exists. + memset(&doc_by_offset, 0x0, sizeof(doc_by_offset)); + Status s = tlbByKey->get( key_hash, doc_by_offset.offset ); + if (!s) break; + + fs = fdb_get_byoffset_raw(kvs_db, &doc_by_offset); + if (fs != FDB_RESULT_SUCCESS) { + break; + } + + if ( rec_io.kv.key == SizedBuf( doc_by_offset.keylen, + doc_by_offset.key ) ) { + skip_normal_search = true; + free(doc_by_offset.key); + doc_by_offset.key = rec_io.kv.key.data; + doc_by_offset.keylen = rec_io.kv.key.size; + doc = &doc_by_offset; + } else { + free(doc_by_offset.key); + free(doc_by_offset.meta); + free(doc_by_offset.body); + } + } + + if (!skip_normal_search) { + if (meta_only) { + fs = fdb_get_metaonly(kvs_db, doc); + } else { + fs = fdb_get(kvs_db, doc); + if ( fs == FDB_RESULT_SUCCESS && tlbByKey ) { + // Put into booster if exists. 
+ tlbByKey->setIfNew( TableLookupBooster::Elem + ( key_hash, doc->seqnum, doc->offset ) ); + } + } + } + } + if (fs != FDB_RESULT_SUCCESS) { + return Status::KEY_NOT_FOUND; + } + + rec_io.kv.value.set(doc->bodylen, doc->body); + rec_io.kv.value.setNeedToFree(); + + // Decode meta. + SizedBuf user_meta_out; + SizedBuf raw_meta(doc->metalen, doc->meta);; + SizedBuf::Holder h_raw_meta(raw_meta); // auto free raw meta. + raw_meta.setNeedToFree(); + bool is_tombstone_out = false; + rawMetaToUserMeta(raw_meta, is_tombstone_out, user_meta_out); + + user_meta_out.moveTo( rec_io.meta ); + + rec_io.seqNum = doc->seqnum; + rec_io.type = (is_tombstone_out || doc->deleted) + ? Record::DELETION + : Record::INSERTION; + + return Status(); +} + +Status TableFile::getByOffset(DB* snap_handle, + uint64_t offset, + Record& rec_out) +{ + fdb_status fs; + fdb_doc doc_by_offset; + memset(&doc_by_offset, 0x0, sizeof(doc_by_offset)); + doc_by_offset.offset = offset; + + fdb_doc* doc = &doc_by_offset; + + if (snap_handle) { + // Snapshot (does not use booster). + fdb_kvs_handle* kvs_db = nullptr; + { mGuard l(snapHandlesLock); + auto entry = snapHandles.find(snap_handle); + if (entry == snapHandles.end()) return Status::SNAPSHOT_NOT_FOUND; + kvs_db = entry->second; + } + + fs = fdb_get_byoffset_raw(kvs_db, &doc_by_offset); + + } else { + // Normal. + FdbHandleGuard g(this, getIdleHandle()); + fdb_kvs_handle* kvs_db = g.handle->db; + + fs = fdb_get_byoffset_raw(kvs_db, &doc_by_offset); + } + if (fs != FDB_RESULT_SUCCESS) { + return Status::INVALID_OFFSET; + } + + rec_out.kv.key.set(doc->keylen, doc->key); + rec_out.kv.key.setNeedToFree(); + + rec_out.kv.value.set(doc->bodylen, doc->body); + rec_out.kv.value.setNeedToFree(); + + // Decode meta. + SizedBuf user_meta_out; + SizedBuf raw_meta(doc->metalen, doc->meta);; + SizedBuf::Holder h_raw_meta(raw_meta); // auto free raw meta. 
+    raw_meta.setNeedToFree();
+    bool is_tombstone_out = false;
+    rawMetaToUserMeta(raw_meta, is_tombstone_out, user_meta_out);
+
+    user_meta_out.moveTo( rec_out.meta );
+
+    rec_out.seqNum = doc->seqnum;
+    rec_out.type = (is_tombstone_out || doc->deleted)
+                   ? Record::DELETION
+                   : Record::INSERTION;
+
+    return Status();
+}
+
+// Serialize the checkpoint map as a U32 count followed by
+// (checkpoint, ForestDB seqnum) U64 pairs.
+Status TableFile::appendCheckpoints(RwSerializer& file_s)
+{
+    mGuard l(chkMapLock);
+    file_s.putU32(chkMap.size());
+    for (auto& entry: chkMap) {
+        uint64_t chk = entry.first;
+        uint64_t fdb_seq = entry.second;
+        file_s.putU64(chk);
+        file_s.putU64(fdb_seq);
+    }
+    return Status();
+}
+
+// Inverse of appendCheckpoints(): read the count, then one
+// (checkpoint, ForestDB seqnum) pair per entry into chkMap.
+Status TableFile::loadCheckpoints(RwSerializer& file_s)
+{
+    mGuard l(chkMapLock);
+    Status s;
+    uint32_t num_chks = file_s.getU32(s);
+    // NOTE(review): this loop was lost in extraction and is restored to
+    // mirror the serialization order in appendCheckpoints() — verify
+    // against the upstream source.
+    for (size_t ii = 0; ii < num_chks; ++ii) {
+        uint64_t chk = file_s.getU64(s);
+        uint64_t fdb_seq = file_s.getU64(s);
+        chkMap.insert( std::make_pair(chk, fdb_seq) );
+    }
+    return Status();
+}
+
+Status TableFile::getAvailCheckpoints(std::list<uint64_t>& chk_out) {
+    mGuard l(chkMapLock);
+    for (auto& entry: chkMap) {
+        uint64_t chk_num = entry.first;
+        chk_out.push_back(chk_num);
+    }
+    return Status();
+}
+
+Status TableFile::getCheckpointSeqnum(uint64_t chk, uint64_t& seqnum_out) {
+    mGuard l(chkMapLock);
+    auto entry = chkMap.find(chk);
+    if (entry != chkMap.end()) {
+        seqnum_out = entry->second;
+        return Status();
+    }
+    return Status::ERROR;
+}
+
+Status TableFile::destroySelf() {
+    if (fOps->exist(filename.c_str())) {
+        // Instead removing it immediately,
+        // put it into remove list.
+        DBMgr* dbm = DBMgr::getWithoutInit();
+        std::string bf_filename = filename + ".bf";
+        if (!dbm) {
+            fOps->remove(filename.c_str());
+            fOps->remove(bf_filename.c_str());
+        } else {
+            dbm->addFileToRemove(filename);
+            dbm->addFileToRemove(bf_filename);
+        }
+    }
+    return Status();
+}
+
+Status TableFile::getLatestSnapMarker(uint64_t& last_snap_seqnum) {
+    FdbHandleGuard g(this, this->getIdleHandle());
+    fdb_file_handle* db_file = g.handle->dbFile;
+
+    // Get last snap marker.
+    fdb_snapshot_info_t* markers = nullptr;
+    uint64_t num_markers = 0;
+    fdb_status fs = fdb_get_all_snap_markers(db_file, &markers, &num_markers);
+    if (fs != FDB_RESULT_SUCCESS) return Status::ERROR;
+    if (!markers || !num_markers) return Status::SNAPSHOT_NOT_FOUND;
+
+    // Markers are returned newest-first, so index 0 is the latest.
+    last_snap_seqnum = markers[0].kvs_markers[0].seqnum;
+    fdb_free_snap_markers(markers, num_markers);
+    return Status();
+}
+
+Status TableFile::getSnapMarkerUpto(uint64_t upto,
+                                    uint64_t& snap_seqnum_out)
+{
+    FdbHandleGuard g(this, this->getIdleHandle());
+    fdb_file_handle* db_file = g.handle->dbFile;
+
+    // Get last snap marker.
+    fdb_snapshot_info_t* markers = nullptr;
+    uint64_t num_markers = 0;
+    fdb_status fs = fdb_get_all_snap_markers(db_file, &markers, &num_markers);
+    if (fs != FDB_RESULT_SUCCESS) return Status::ERROR;
+    if (!markers || !num_markers) return Status::SNAPSHOT_NOT_FOUND;
+
+    snap_seqnum_out = 0;
+    // NOTE(review): loop header restored after extraction damage.
+    // Markers are newest-first, so the first marker whose seqnum does not
+    // exceed `upto` is the latest eligible snapshot.
+    for (size_t ii = 0; ii < num_markers; ++ii) {
+        if (upto >= markers[ii].kvs_markers[0].seqnum) {
+            snap_seqnum_out = markers[ii].kvs_markers[0].seqnum;
+            break;
+        }
+    }
+    fdb_free_snap_markers(markers, num_markers);
+    return Status();
+}
+
+Status TableFile::getOldestSnapMarker(uint64_t& oldest_snap_seqnum) {
+    FdbHandleGuard g(this, this->getIdleHandle());
+    fdb_file_handle* db_file = g.handle->dbFile;
+
+    // Get first snap marker.
+ fdb_snapshot_info_t* markers = nullptr; + uint64_t num_markers = 0; + fdb_status fs = fdb_get_all_snap_markers(db_file, &markers, &num_markers); + if (fs != FDB_RESULT_SUCCESS) return Status::ERROR; + if (!markers || !num_markers) return Status::SNAPSHOT_NOT_FOUND; + + oldest_snap_seqnum = markers[num_markers-1].kvs_markers[0].seqnum; + fdb_free_snap_markers(markers, num_markers); + return Status(); +} + +Status TableFile::getStats(TableStats& stats_out) { + FdbHandleGuard g(this, this->getIdleHandle()); + fdb_file_handle* db_file = g.handle->dbFile; + fdb_kvs_handle* kvs_db = g.handle->db; + + fdb_file_info info; + fdb_status fs = fdb_get_file_info(db_file, &info); + if (fs != FDB_RESULT_SUCCESS) return Status::ERROR; + + fdb_kvs_info kvs_info; + fs = fdb_get_kvs_info(kvs_db, &kvs_info); + + stats_out.numKvs = info.doc_count; + stats_out.workingSetSizeByte = info.space_used; + stats_out.totalSizeByte = info.file_size; + + // This should be a bug. + assert(stats_out.workingSetSizeByte < stats_out.totalSizeByte * 2); + if (stats_out.workingSetSizeByte > stats_out.totalSizeByte * 2) { + _log_fatal(myLog, "found wrong WSS, %s, %zu / %zu", + filename.c_str(), + stats_out.workingSetSizeByte, + stats_out.totalSizeByte); + + DBMgr* dbm = DBMgr::getWithoutInit(); + if (dbm) { + _log_fatal(dbm->getLogger(), + "found wrong WSS, %s, %zu / %zu", + filename.c_str(), + stats_out.workingSetSizeByte, + stats_out.totalSizeByte); + } + // Make it small so as to compact quickly + stats_out.workingSetSizeByte = stats_out.totalSizeByte / 10; + } + + stats_out.blockReuseCycle = info.sb_bmp_revnum; + + stats_out.lastSeqnum = kvs_info.last_seqnum; + stats_out.approxDocCount = kvs_info.doc_count; + stats_out.approxDelCount = kvs_info.deleted_count; + + return Status(); +} + +Status TableFile::getMaxKey(SizedBuf& max_key_out) { + Status s; + TableFile::Iterator itr; + EP( itr.init(nullptr, this, SizedBuf(), SizedBuf()) ); + + try { + TC( itr.gotoEnd() ); + + Record rec_out; + 
Record::Holder h_rec_out(rec_out);
+        TC( itr.get(rec_out) );
+
+        rec_out.kv.key.moveTo(max_key_out);
+        return Status();
+
+    } catch (Status s) {
+        return s;
+    }
+}
+
+// Open a ForestDB snapshot at the latest snap marker and publish it at the
+// front of `latestSnapshot`. Stale snapshots whose refCount drops to zero
+// are closed outside the lock.
+Status TableFile::updateSnapshot() {
+    fdb_seqnum_t snap_seqnum = 0;
+    getLatestSnapMarker(snap_seqnum);
+
+    FdbHandleGuard g(this, getIdleHandle());
+    fdb_kvs_handle* kvs_db = g.handle->db;
+    fdb_kvs_handle* snap_handle = nullptr;
+    fdb_status fs = fdb_snapshot_open(kvs_db, &snap_handle, snap_seqnum);
+    if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_KVS_FAIL;
+
+    Snapshot* new_snp = new Snapshot(this, snap_handle, snap_seqnum);
+
+    std::list<Snapshot*> stale_snps;
+    {   std::lock_guard<std::mutex> l(latestSnapshotLock);
+        auto entry = latestSnapshot.begin();
+        while (entry != latestSnapshot.end()) {
+            Snapshot*& cur_snp = *entry;
+            // Drop the list's own reference; anything unreferenced
+            // elsewhere becomes stale.
+            cur_snp->refCount--;
+            if (!cur_snp->refCount) {
+                stale_snps.push_back(cur_snp);
+                entry = latestSnapshot.erase(entry);
+            } else {
+                entry++;
+            }
+        }
+        latestSnapshot.push_front(new_snp);
+    }
+
+    // Close all stale snapshots (refCount == 0).
+    for (Snapshot*& cur_snp: stale_snps) {
+        fdb_kvs_close(cur_snp->fdbSnap);
+        delete cur_snp;
+    }
+    return Status();
+}
+
+Status TableFile::leaseSnapshot(TableFile::Snapshot*& snp_out) {
+    std::lock_guard<std::mutex> l(latestSnapshotLock);
+    auto entry = latestSnapshot.begin();
+    assert(entry != latestSnapshot.end());
+    Snapshot* snp = *entry;
+    snp->refCount++;
+    snp_out = snp;
+
+    return Status();
+}
+
+Status TableFile::returnSnapshot(TableFile::Snapshot* snapshot) {
+    std::list<Snapshot*> stale_snps;
+    {   std::lock_guard<std::mutex> l(latestSnapshotLock);
+        snapshot->refCount--;
+        auto entry = latestSnapshot.begin();
+        while (entry != latestSnapshot.end()) {
+            Snapshot*& cur_snp = *entry;
+            if (!cur_snp->refCount) {
+                stale_snps.push_back(cur_snp);
+                entry = latestSnapshot.erase(entry);
+            } else {
+                entry++;
+            }
+        }
+    }
+
+    // Close all stale snapshots (refCount == 0).
+    for (Snapshot*& cur_snp: stale_snps) {
+        fdb_kvs_close(cur_snp->fdbSnap);
+        delete cur_snp;
+    }
+    return Status();
+}
+
+} // namespace jungle
+
diff --git a/src/table_file.h b/src/table_file.h
new file mode 100644
index 0000000..e174e89
--- /dev/null
+++ b/src/table_file.h
@@ -0,0 +1,366 @@
+/************************************************************************
+Copyright 2017-2019 eBay Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+**************************************************************************/
+
+#pragma once
+
+#include "fileops_base.h"
+#include "internal_helper.h"
+#include "table_lookup_booster.h"
+
+// NOTE(review): the angle-bracket include targets below were lost during
+// extraction; restored from the types this header uses (fdb_* handles,
+// std::list/map/mutex/unordered_map members) — verify against upstream.
+#include <libjungle/jungle.h>
+#include <libforestdb/forestdb.h>
+
+#include <list>
+#include <map>
+#include <mutex>
+#include <unordered_map>
+
+class BloomFilter;
+class SimpleLogger;
+
+namespace jungle {
+
+class TableFileOptions {
+public:
+    TableFileOptions()
+        : minBlockReuseFileSize(64*1024*1024)   // 64 MB.
+        , bloomFilterSize(0)
+        {}
+
+    uint64_t minBlockReuseFileSize;
+
+    // Pre-defined bloom filter size for this table
+    // (the total number of bits).
+    // If zero, it will be automatically calculated.
+    uint64_t bloomFilterSize;
+};
+
+class RwSerializer;
+struct TableInfo;
+class TableMgr;
+class TableStats;
+class TableFile {
+    struct FdbHandle;
+
+public:
+    static const TableFileOptions DEFAULT_OPT;
+
+    TableFile(const TableMgr* table_mgr);
+
+    ~TableFile();
+
+    void fdbLogCb(int level,
+                  int ec,
+                  const char* file,
+                  const char* func,
+                  size_t line,
+                  const char* err_msg,
+                  void* ctx);
+
+    static std::string getTableFileName(const std::string& path,
+                                        uint64_t prefix_num,
+                                        uint64_t table_file_num);
+
+    static uint64_t getBfSizeByLevel(const DBConfig* db_config,
+                                     size_t level);
+
+    static uint64_t getBfSizeByWss(const DBConfig* db_config,
+                                   uint64_t wss);
+
+    FdbHandle* getIdleHandle();
+
+    void returnHandle(FdbHandle* f_handle);
+
+    Status openFdbHandle(const DBConfig* db_config,
+                         const std::string& f_name,
+                         FdbHandle* f_handle);
+
+    uint64_t getBfSize() const;
+
+    Status create(size_t level,
+                  uint64_t table_number,
+                  const std::string& filename,
+                  FileOps* f_ops,
+                  const TableFileOptions& opt = DEFAULT_OPT);
+
+    Status load(size_t level,
+                uint64_t table_number,
+                const std::string& f_name,
+                FileOps* f_ops,
+                const TableFileOptions& opt = DEFAULT_OPT);
+
+    Status loadBloomFilter(const std::string& filename,
+                           BloomFilter*& bf_out);
+
+    Status saveBloomFilter(const std::string& filename,
+                           BloomFilter* bf,
+                           bool call_fsync);
+
+    Status changeOptions(const TableFileOptions& new_opt);
+
+    Status openSnapshot(DB* snap_handle,
+                        const uint64_t checkpoint);
+    Status closeSnapshot(DB* snap_handle);
+
+    static void userMetaToRawMeta(const SizedBuf& user_meta,
+                                  bool is_tombstone,
+                                  SizedBuf& raw_meta_out);
+
+    static void rawMetaToUserMeta(const SizedBuf& raw_meta,
+                                  bool& is_tombstone_out,
+                                  SizedBuf& user_meta_out);
+
+    Status setSingle(uint32_t key_hash_val,
+                     const Record& rec,
+                     uint64_t& offset_out);
+
+    // NOTE(review): element types of `batch` and `checkpoints` were lost
+    // in extraction; restored from the setBatch() definition, which
+    // iterates Record* entries and uint64_t checkpoints.
+    Status setBatch(std::list<Record*>& batch,
+                    std::list<uint64_t>& checkpoints,
+                    const SizedBuf& min_key,
+                    const SizedBuf& min_key_next_table,
+                    uint32_t target_hash = _SCU32(-1),
+                    bool bulk_load_mode = false);
+
+    Status get(DB* snap_handle,
+               Record& rec_inout,
+               bool meta_only = false);
+
+    Status getByOffset(DB* snap_handle,
+                       uint64_t offset,
+                       Record& rec_out);
+
+    Status appendCheckpoints(RwSerializer& file_s);
+
+    Status loadCheckpoints(RwSerializer& file_s);
+
+    Status getAvailCheckpoints(std::list<uint64_t>& chk_out);
+
+    Status getCheckpointSeqnum(uint64_t chk, uint64_t& seqnum_out);
+
+    bool isFdbDocTombstone(fdb_doc* doc);
+
+    Status compactTo(const std::string& dst_filename,
+                     const CompactOptions& options);
+
+    Status mergeCompactTo(const std::string& file_to_merge,
+                          const std::string& dst_filename,
+                          const CompactOptions& options);
+
+    Status destroySelf();
+
+    Status getLatestSnapMarker(uint64_t& last_snap_seqnum);
+
+    Status getSnapMarkerUpto(uint64_t upto,
+                             uint64_t& snap_seqnum_out);
+
+    Status getOldestSnapMarker(uint64_t& oldest_snap_seqnum);
+
+    void addCheckpoint(uint64_t chk, uint64_t commit_seqnum);
+
+    void setLogger(SimpleLogger* logger) { myLog = logger; }
+
+    void setTableInfo(TableInfo* t_info) { tableInfo = t_info; }
+
+    SimpleLogger* getLogger() const { return myLog; }
+
+    Status getStats(TableStats& stats_out);
+
+    const std::string& getName() const { return filename; }
+
+    uint64_t getNumber() const { return myNumber; }
+
+    Status getMaxKey(SizedBuf& max_key_out);
+
+    class Snapshot {
+    public:
+        Snapshot(TableFile* t_file,
+                 fdb_kvs_handle* fdb_snap,
+                 uint64_t fdb_seqnum)
+            : tFile(t_file)
+            , fdbSnap(fdb_snap)
+            , fdbSeqnum(fdb_seqnum)
+            , refCount(1)
+            {}
+        TableFile* tFile;
+        fdb_kvs_handle* fdbSnap;
+        uint64_t fdbSeqnum;
+        uint32_t refCount;
+    };
+
+    class Iterator {
+    public:
+        Iterator();
+        ~Iterator();
+
+        enum SeekOption {
+            GREATER = 0,
+            SMALLER = 1,
+        };
+
+        Status init(DB* snap_handle,
+                    TableFile* t_file,
+                    const SizedBuf& start_key,
+                    const SizedBuf& end_key);
+        Status initSN(DB* snap_handle,
+                      TableFile* t_file,
+                      const uint64_t min_seq,
+                      const uint64_t max_seq);
+        Status get(Record&
rec_out); + Status getMeta(Record& rec_out, + size_t& valuelen_out, + uint64_t& offset_out); + Status prev(); + Status next(); + Status seek(const SizedBuf& key, SeekOption opt = GREATER); + Status seekSN(const uint64_t seqnum, SeekOption opt = GREATER); + Status gotoBegin(); + Status gotoEnd(); + Status close(); + + enum Type { + BY_KEY = 0, + BY_SEQ = 1, + } type; + TableFile* tFile; + // Snapshot of Table file. + Snapshot* tFileSnap; + // Snapshot handle of ForestDB. + fdb_kvs_handle* fdbSnap; + // Iterator handle of ForestDB, derived from `fdbSnap`. + fdb_iterator* fdbItr; + uint64_t minSeq; + uint64_t maxSeq; + }; + + Status updateSnapshot(); + + Status leaseSnapshot(Snapshot*& snapshot_out); + + Status returnSnapshot(Snapshot* snapshot); + +private: +// === TYPES + // Compaction is triggered only when file size is bigger than 4MB. + static const uint64_t MIN_COMPACT_FILE_SIZE = 4*1024*1024; + // Compact when `active size < file size * 50 %`. + static const uint64_t COMPACT_RATIO = 50; + // Additional size added to user meta. 
+    static const size_t META_ADD_SIZE = 9;
+
+    struct FdbHandle {
+        FdbHandle(TableFile* _parent,
+                  const DBConfig* db_config,
+                  const TableFileOptions& t_file_opt);
+        ~FdbHandle();
+
+        fdb_config getFdbSettings(const DBConfig* db_config);
+        fdb_kvs_config getKvsSettings();
+        void refreshSettings();
+        Status open(const std::string& filename);
+        Status openCustomCmp(const std::string& filename,
+                             fdb_custom_cmp_variable cmp_func,
+                             void* cmp_func_param);
+        Status commit();
+        Status close();
+
+        TableFile* parent;
+        const DBConfig* dbConfig;
+        const TableFileOptions& tFileOpt;
+        fdb_file_handle* dbFile;
+        fdb_kvs_handle* db;
+        fdb_config config;
+        fdb_kvs_config kvsConfig;
+    };
+
+    struct FdbHandleGuard {
+        FdbHandleGuard(TableFile* _t_file, FdbHandle* _handle);
+        ~FdbHandleGuard();
+        TableFile* tFile;
+        FdbHandle* handle;
+    };
+
+// === FUNCTIONS
+    void initBooster(size_t level, const DBConfig* db_config);
+
+    // NOTE(review): element type of `checkpoints` restored from the
+    // setBatch() call site, which passes a std::list<uint64_t>.
+    Status setCheckpoint(Record* rec,
+                         uint64_t prev_seqnum,
+                         std::list<uint64_t>& checkpoints,
+                         bool remaining_all = false);
+
+    Status compactToManully(FdbHandle* compact_handle,
+                            const std::string& dst_filename,
+                            const CompactOptions& options);
+
+// === VARIABLES
+    // File name.
+    std::string filename;
+
+    // Table number.
+    uint64_t myNumber;
+
+    // File operations.
+    FileOps* fOps;
+
+    // File options.
+    TableFileOptions myOpt;
+
+    // Parent table manager.
+    const TableMgr* tableMgr;
+
+    // Corresponding table info.
+    TableInfo* tableInfo;
+
+    // ForestDB writer handles
+    FdbHandle* writer;
+
+    // List of ForestDB reader handles
+    std::list<FdbHandle*> readers;
+
+    // Lock for `readers`.
+    std::mutex readersLock;
+
+    // Map {Checkpoint seqnum, ForestDB commit num}
+    std::map<uint64_t, uint64_t> chkMap;
+
+    // Lock for `chkMap`.
+    std::mutex chkMapLock;
+
+    // Map {Jungle snapshot handle, ForestDB snapshot handles}
+    std::unordered_map<DB*, fdb_kvs_handle*> snapHandles;
+
+    // Lock of `snapHandles`.
+    std::mutex snapHandlesLock;
+
+    // Pre-opened the snapshot of the latest Table.
+    // The first one is the newest one.
+    std::list<Snapshot*> latestSnapshot;
+
+    // Lock of `latestSnapshot`.
+    std::mutex latestSnapshotLock;
+
+    // Bloom filter for key.
+    BloomFilter* bfByKey;
+
+    // Lookup booster.
+    TableLookupBooster* tlbByKey;
+
+    // Logger.
+    SimpleLogger* myLog;
+};
+
+} // namespace jungle
+
diff --git a/src/table_file_compaction.cc b/src/table_file_compaction.cc
new file mode 100644
index 0000000..0496172
--- /dev/null
+++ b/src/table_file_compaction.cc
@@ -0,0 +1,433 @@
+/************************************************************************
+Copyright 2017-2019 eBay Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+**************************************************************************/
+
+#include "table_file.h"
+
+#include "bloomfilter.h"
+#include "db_mgr.h"
+#include "internal_helper.h"
+#include "table_mgr.h"
+
+#include _MACRO_TO_STR(LOGGER_H)
+
+namespace jungle {
+
+Status TableFile::compactTo(const std::string& dst_filename,
+                            const CompactOptions& options)
+{
+    Status s;
+    const DBConfig* db_config = tableMgr->getDbConfig();
+
+    FdbHandle* compact_handle = new FdbHandle(this, db_config, myOpt);
+    EP( openFdbHandle(db_config, filename, compact_handle) );
+
+    s = compactToManully( compact_handle,
+                          dst_filename,
+                          options );
+
+    delete compact_handle;
+    return s;
+}
+
+// Check
+//   1) Jungle's own meta section, and
+//   2) User defined custom tombstone checking function.
+bool TableFile::isFdbDocTombstone(fdb_doc* doc)
+{
+    const DBConfig* db_config = tableMgr->getDbConfig();
+
+    // Decode meta.
+    SizedBuf user_meta_out;
+    SizedBuf::Holder h_user_meta_out(user_meta_out); // auto free.
+
+    SizedBuf raw_meta(doc->metalen, doc->meta);
+    bool is_tombstone_out = false;
+    rawMetaToUserMeta(raw_meta, is_tombstone_out, user_meta_out);
+
+    if (doc->deleted) is_tombstone_out = true;
+
+    // If custom tombstone function exists, call and check it.
+    if (db_config->compactionCbFunc) {
+        CompactionCbParams params;
+        params.rec.kv.key = SizedBuf(doc->keylen, doc->key);
+        params.rec.kv.value = SizedBuf(doc->bodylen, doc->body);
+        params.rec.meta = SizedBuf(user_meta_out.size, user_meta_out.data);
+        params.rec.seqNum = doc->seqnum;
+
+        CompactionCbDecision dec = db_config->compactionCbFunc(params);
+        if (dec == CompactionCbDecision::DROP) {
+            is_tombstone_out = true;
+        }
+    }
+    return is_tombstone_out;
+}
+
+Status TableFile::compactToManully(FdbHandle* compact_handle,
+                                   const std::string& dst_filename,
+                                   const CompactOptions& options)
+{
+    _log_info(myLog, "doing manual compaction");
+    Timer tt;
+    Status s;
+
+    bool is_last_level = (tableInfo->level == tableMgr->getNumLevels() - 1);
+    DBConfig local_config = *(tableMgr->getDbConfig());
+    // Set bulk loading true to set WAL-flush-before-commit.
+    local_config.bulkLoading = true;
+
+    // Block reuse shouldn't happen during compaction.
+    // NOTE(review): template argument of numeric_limits restored after
+    // extraction damage; the field is a uint64_t.
+    TableFileOptions dst_opt;
+    dst_opt.minBlockReuseFileSize = std::numeric_limits<uint64_t>::max();
+
+    FdbHandle* dst_handle = new FdbHandle(this, &local_config, dst_opt);
+    GcDelete<FdbHandle*> gc_dst(dst_handle);
+
+    EP( openFdbHandle(&local_config, dst_filename, dst_handle) );
+
+    // Create bloom filter for destination file.
+    BloomFilter* dst_bf = nullptr;
+    if ( local_config.bloomFilterBitsPerUnit > 0.0 ) {
+        // Calculate based on WSS.
+ uint64_t bf_bitmap_size = myOpt.bloomFilterSize; + TableStats my_stats; + getStats(my_stats); + if (my_stats.workingSetSizeByte) { + bf_bitmap_size = getBfSizeByWss(&local_config, my_stats.workingSetSizeByte); + } + if (!bf_bitmap_size) { + bf_bitmap_size = getBfSizeByLevel(&local_config, tableInfo->level); + } + dst_bf = new BloomFilter(bf_bitmap_size, 3); + } + + fdb_iterator* itr = nullptr; + fdb_status fs = FDB_RESULT_SUCCESS; + fs = fdb_iterator_init( compact_handle->db, + &itr, + nullptr, 0, nullptr, 0, + FDB_ITR_NO_DELETES ); + if (fs != FDB_RESULT_SUCCESS) return Status::MANUAL_COMPACTION_OPEN_FAILED; + + uint64_t cnt = 0; + uint64_t discards = 0; + s = Status::OK; + + DBMgr* mgr = DBMgr::getWithoutInit(); + DebugParams d_params = mgr->getDebugParams(); + + // Flush block cache for every 5 second. + Timer sync_timer; + sync_timer.setDurationMs(5000); + + do { + fdb_doc tmp_doc; + memset(&tmp_doc, 0x0, sizeof(tmp_doc)); + + fdb_doc *ret_doc = &tmp_doc; + fs = fdb_iterator_get(itr, &ret_doc); + if (fs != FDB_RESULT_SUCCESS) break; + + // If 1) flag (for not to delete tombstone) is set, OR + // 2) LSM / level extension mode AND + // current level is not the last level, + // then skip checking whether the given record is tombstone. + bool is_tombstone_out = false; + bool check_tombstone = true; + if (options.preserveTombstone) { + check_tombstone = false; + } + if ( local_config.nextLevelExtension && + !is_last_level ) { + check_tombstone = false; + } + if (check_tombstone) { + is_tombstone_out = isFdbDocTombstone(ret_doc); + } + + if (is_tombstone_out) { + // Tombstone. + discards++; + } else { + // WARNING: SHOULD KEEP THE SAME SEQUENCE NUMBER! 
+            ret_doc->flags = FDB_CUSTOM_SEQNUM;
+            fdb_set(dst_handle->db, ret_doc);
+            cnt++;
+
+            if (dst_bf) dst_bf->set(ret_doc->key, ret_doc->keylen);
+        }
+
+        free(ret_doc->key);
+        free(ret_doc->meta);
+        free(ret_doc->body);
+
+        if (!tableMgr->isCompactionAllowed()) {
+            s = Status::COMPACTION_CANCELLED;
+            break;
+        }
+
+        if (sync_timer.timeout()) {
+            fdb_sync_file(dst_handle->dbFile);
+            sync_timer.reset();
+        }
+
+        if (d_params.compactionDelayUs) {
+            // If debug parameter is given, sleep here.
+            Timer::sleepUs(d_params.compactionDelayUs);
+        }
+
+    } while (fdb_iterator_next(itr) == FDB_RESULT_SUCCESS);
+
+    uint64_t elapsed_us = std::max( tt.getUs(), (uint64_t)1 );
+    uint64_t bf_size = (dst_bf) ? dst_bf->size() : 0;
+    _log_info( myLog, "in-place compaction moved %zu live docs, "
+               "%zu tombstones, BF size %zu bytes (%zu bits), %zu us, %.1f iops",
+               cnt, discards,
+               bf_size / 8, bf_size,
+               elapsed_us,
+               (double)(cnt + discards) * 1000000 / elapsed_us );
+
+    // WARNING: Should be done before commit.
+    if (dst_bf) {
+        saveBloomFilter(dst_filename + ".bf", dst_bf, true);
+        DELETE(dst_bf);
+    }
+    dst_handle->commit();
+
+    fs = fdb_iterator_close(itr);
+    itr = nullptr;
+
+    return s;
+}
+
+Status TableFile::mergeCompactTo(const std::string& file_to_merge,
+                                 const std::string& dst_filename,
+                                 const CompactOptions& options)
+{
+    Status s;
+    fdb_status fs = FDB_RESULT_SUCCESS;
+    const DBConfig* db_config = tableMgr->getDbConfig();
+    DBConfig dst_config = *db_config;
+    dst_config.bulkLoading = true;
+
+    Timer tt;
+
+    // Open ForestDB handles.
+    FdbHandle* my_handle = new FdbHandle(this, db_config, myOpt);
+    FdbHandle* merge_handle = new FdbHandle(this, db_config, myOpt);
+
+    // Block reuse shouldn't happen during compaction.
+    // NOTE(review): template argument of numeric_limits restored after
+    // extraction damage; the field is a uint64_t.
+    TableFileOptions dst_opt;
+    dst_opt.minBlockReuseFileSize = std::numeric_limits<uint64_t>::max();
+    FdbHandle* dst_handle = new FdbHandle(this, &dst_config, dst_opt);
+
+    // Create bloom filter for destination file.
+    BloomFilter* dst_bf = nullptr;
+    if ( db_config->bloomFilterBitsPerUnit > 0.0 ) {
+        // Calculate based on WSS if this (origin) table.
+        uint64_t bf_bitmap_size = myOpt.bloomFilterSize;
+        TableStats my_stats;
+        getStats(my_stats);
+        if (my_stats.workingSetSizeByte) {
+            bf_bitmap_size = getBfSizeByWss(db_config, my_stats.workingSetSizeByte);
+        }
+        if (!bf_bitmap_size) {
+            bf_bitmap_size = getBfSizeByLevel(db_config, tableInfo->level);
+        }
+        dst_bf = new BloomFilter(bf_bitmap_size, 3);
+    }
+
+    // Auto free.
+    // NOTE(review): GcDelete template arguments restored after extraction
+    // damage; all three guards own FdbHandle pointers.
+    GcDelete<FdbHandle*> gc_my_handle(my_handle);
+    GcDelete<FdbHandle*> gc_merge_handle(merge_handle);
+    GcDelete<FdbHandle*> gc_dst_handle(dst_handle);
+
+    EP( openFdbHandle(db_config, filename, my_handle) );
+    EP( openFdbHandle(db_config, file_to_merge, merge_handle) );
+    EP( openFdbHandle(db_config, dst_filename, dst_handle) );
+
+    // Open iterators.
+    fdb_iterator* my_itr = nullptr;
+    fdb_iterator* merge_itr = nullptr;
+
+    fs = fdb_iterator_init( my_handle->db, &my_itr,
+                            nullptr, 0, nullptr, 0, FDB_ITR_NO_DELETES );
+    if (fs != FDB_RESULT_SUCCESS) return Status::MANUAL_COMPACTION_OPEN_FAILED;
+    // Auto close.
+    GcFunc gc_my_itr( std::bind(fdb_iterator_close, my_itr) );
+
+    fs = fdb_iterator_init( merge_handle->db, &merge_itr,
+                            nullptr, 0, nullptr, 0, FDB_ITR_NO_DELETES );
+    if (fs != FDB_RESULT_SUCCESS) return Status::MANUAL_COMPACTION_OPEN_FAILED;
+    // Auto close.
+    GcFunc gc_merge_itr( std::bind(fdb_iterator_close, merge_itr) );
+
+    DBMgr* mgr = DBMgr::getWithoutInit();
+    DebugParams d_params = mgr->getDebugParams();
+
+    // Flush block cache for every 5 second.
+ Timer sync_timer; + sync_timer.setDurationMs(5000); + + uint64_t my_cnt = 0, my_discards = 0, + merge_cnt = 0, merge_discards = 0; + uint64_t final_cnt = 0; + bool my_itr_ended = false; + bool merge_itr_ended = false; + fdb_doc* my_doc = nullptr; + fdb_doc* merge_doc = nullptr; + do { + fdb_status my_fs = FDB_RESULT_SUCCESS; + fdb_status merge_fs = FDB_RESULT_SUCCESS; + + if (!my_doc) my_fs = fdb_iterator_get(my_itr, &my_doc); + if (!merge_doc) merge_fs = fdb_iterator_get(merge_itr, &merge_doc); + if ( my_fs != FDB_RESULT_SUCCESS && + merge_fs != FDB_RESULT_SUCCESS ) break; + + int cmp = 0; + if ( my_doc && !merge_doc) cmp = -1; + if (!my_doc && merge_doc) cmp = 1; + if (!my_doc && !merge_doc) assert(0); + if ( my_doc && merge_doc) { + if (db_config->cmpFunc) { + // Custom cmp mode. + CustomCmpFunc func = db_config->cmpFunc; + void* param = db_config->cmpFuncParam; + cmp = func( my_doc->key, my_doc->keylen, + merge_doc->key, merge_doc->keylen, param ); + } else { + SizedBuf l(my_doc->keylen, my_doc->key); + SizedBuf r(merge_doc->keylen, merge_doc->key); + cmp = SizedBuf::cmp(l, r); + } + } + + fdb_doc* doc_choosen = nullptr; + + uint64_t* cnt = &my_cnt; + uint64_t* discards = &my_discards; + + if (cmp < 0) { // `my_doc < merge_doc` + doc_choosen = my_doc; + my_doc = nullptr; + + } else if (cmp > 0) { // `my_doc > merge_doc` + doc_choosen = merge_doc; + merge_doc = nullptr; + cnt = &merge_cnt; + discards = &merge_discards; + + } else { // `my_doc == merge_doc` + // We should compare sequence number, + // and pick fresher one only. + if (my_doc->seqnum > merge_doc->seqnum) { + doc_choosen = my_doc; + fdb_doc_free(merge_doc); + merge_cnt++; + + // WARNING: The same sequence number should be allowed. + } else if (my_doc->seqnum <= merge_doc->seqnum) { + doc_choosen = merge_doc; + fdb_doc_free(my_doc); + cnt = &merge_cnt; + discards = &merge_discards; + my_cnt++; + } + + // And also move both cursors. 
+ my_doc = nullptr; + merge_doc = nullptr; + } + + bool is_tombstone_out = false; + if (!options.preserveTombstone) { + is_tombstone_out = isFdbDocTombstone(doc_choosen); + } + if (is_tombstone_out) { + // Tombstone. + (*discards)++; + + } else { + // WARNING: SHOULD KEEP THE SAME SEQUENCE NUMBER! + doc_choosen->flags = FDB_CUSTOM_SEQNUM; + fdb_set(dst_handle->db, doc_choosen); + (*cnt)++; + final_cnt++; + + if (dst_bf) dst_bf->set(doc_choosen->key, doc_choosen->keylen); + } + fdb_doc_free(doc_choosen); + + // Move iterator of choosen doc. + if (!my_doc) { + my_fs = fdb_iterator_next(my_itr); + if (my_fs != FDB_RESULT_SUCCESS) my_itr_ended = true; + } + if (!merge_doc) { + merge_fs = fdb_iterator_next(merge_itr); + if (merge_fs != FDB_RESULT_SUCCESS) merge_itr_ended = true; + } + + if (!tableMgr->isCompactionAllowed()) { + s = Status::COMPACTION_CANCELLED; + if (my_doc) fdb_doc_free(my_doc); + if (merge_doc) fdb_doc_free(merge_doc); + break; + } + + if (sync_timer.timeout()) { + fdb_sync_file(dst_handle->dbFile); + sync_timer.reset(); + } + + if (d_params.compactionDelayUs) { + // If debug parameter is given, sleep here. + Timer::sleepUs(d_params.compactionDelayUs); + } + + // Until both iterators reach end. + } while ( !my_itr_ended || !merge_itr_ended ); + + // WARNING: Should be done before commit. + if (dst_bf) { + saveBloomFilter(dst_filename + ".bf", dst_bf, true); + DELETE(dst_bf); + } + dst_handle->commit(); + + // Close iterator first. + gc_my_itr.gcNow(); + gc_merge_itr.gcNow(); + + uint64_t elapsed_us = std::max( tt.getUs(), (uint64_t)1 ); + uint64_t bf_size = (dst_bf) ? 
dst_bf->size() : 0; + _log_info( myLog, "in-place merge compaction " + "moved %zu live docs %zu tombstones from mine, " + "%zu live docs %zu tombstones from merge, " + "%zu docs in new file, BF size %zu byets (%zu bits), " + "%zu us elapsed, %.1f iops", + my_cnt, my_discards, merge_cnt, merge_discards, + final_cnt, + bf_size / 8, bf_size, + elapsed_us, + (double)(my_cnt + my_discards + + merge_cnt + merge_discards) * 1000000 / elapsed_us ); + + return s; +} + +} + diff --git a/src/table_file_iterator.cc b/src/table_file_iterator.cc new file mode 100644 index 0000000..cf665bd --- /dev/null +++ b/src/table_file_iterator.cc @@ -0,0 +1,328 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "table_file.h" + +#include "table_mgr.h" + +namespace jungle { + +// === iterator =============================================================== + +TableFile::Iterator::Iterator() + : tFile(nullptr) + , tFileSnap(nullptr) + , fdbSnap(nullptr) + , fdbItr(nullptr) + , minSeq(NOT_INITIALIZED) + , maxSeq(NOT_INITIALIZED) + {} + +TableFile::Iterator::~Iterator() { + close(); +} + +Status TableFile::Iterator::init(DB* snap_handle, + TableFile* t_file, + const SizedBuf& start_key, + const SizedBuf& end_key) +{ + tFile = t_file; + fdb_status fs; + Status s; + + // Get last snap marker. 
+ fdb_kvs_handle* fdb_snap = nullptr; + + if (snap_handle) { + // Snapshot. + mGuard l(t_file->snapHandlesLock); + auto entry = t_file->snapHandles.find(snap_handle); + if (entry == t_file->snapHandles.end()) return Status::SNAPSHOT_NOT_FOUND; + + // WARNING: + // Currently one snapshot only allows one iterator. + // + // TODO: + // Should open a separate snapshot handle! + // Multiple iterators derived from the same snapshot handle will share + // `dhandle` and `bhandle` which will cause race condition. + fdb_snap = entry->second; + + } else { + // Normal, use latest snapshot. + tFile->leaseSnapshot(tFileSnap); + fs = fdb_snapshot_open( tFileSnap->fdbSnap, + &fdbSnap, + tFileSnap->fdbSeqnum ); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_KVS_FAIL; + + fdb_snap = fdbSnap; + } + + // if (valid_number(chk) && snap_seqnum > chk) snap_seqnum = chk; + + fs = fdb_iterator_init(fdb_snap, &fdbItr, + start_key.data, start_key.size, + end_key.data, end_key.size, + FDB_ITR_NO_DELETES); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_KVS_FAIL; + + type = BY_KEY; + + return Status(); +} + +Status TableFile::Iterator::initSN(DB* snap_handle, + TableFile* t_file, + const uint64_t min_seq, + const uint64_t max_seq) +{ + tFile = t_file; + fdb_status fs; + Status s; + + // Get last snap marker. + fdb_seqnum_t snap_seqnum = 0; + fdb_kvs_handle* fdb_snap = nullptr; + + if (snap_handle) { + // Snapshot. + mGuard l(t_file->snapHandlesLock); + auto entry = t_file->snapHandles.find(snap_handle); + if (entry == t_file->snapHandles.end()) return Status::SNAPSHOT_NOT_FOUND; + + // WARNING: See `init()` above. + fdb_snap = entry->second; + + fs = fdb_get_kvs_seqnum(fdb_snap, &snap_seqnum); + if (fs != FDB_RESULT_SUCCESS) return Status::INVALID_SNAPSHOT; + + } else { + // Normal, use latest snapshot. 
+ tFile->leaseSnapshot(tFileSnap); + fs = fdb_snapshot_open( tFileSnap->fdbSnap, + &fdbSnap, + tFileSnap->fdbSeqnum ); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_KVS_FAIL; + + fdb_snap = fdbSnap; + } + + // if (valid_number(chk) && snap_seqnum > chk) snap_seqnum = chk; + + if (valid_number(min_seq)) { + minSeq = min_seq; + } else { + minSeq = 0; + } + if (valid_number(max_seq)) { + maxSeq = std::min(snap_seqnum, max_seq); + } else { + maxSeq = 0; + } + + fs = fdb_iterator_sequence_init(fdb_snap, &fdbItr, + minSeq, maxSeq, + FDB_ITR_NO_DELETES); + if (fs != FDB_RESULT_SUCCESS) return Status::FDB_OPEN_KVS_FAIL; + + type = BY_SEQ; + + return Status(); +} + +Status TableFile::Iterator::get(Record& rec_out) { + if (!tFile || !fdbItr) return Status::NOT_INITIALIZED; + + fdb_status fs; + + fdb_doc tmp_doc; + memset(&tmp_doc, 0x0, sizeof(tmp_doc)); + + fdb_doc *doc = &tmp_doc; + fs = fdb_iterator_get(fdbItr, &doc); + if (fs != FDB_RESULT_SUCCESS) { + return Status::ERROR; + } + + rec_out.kv.key.set(doc->keylen, doc->key); + rec_out.kv.key.setNeedToFree(); + rec_out.kv.value.set(doc->bodylen, doc->body); + rec_out.kv.value.setNeedToFree(); + + // Decode meta. + SizedBuf user_meta_out; + SizedBuf raw_meta(doc->metalen, doc->meta);; + SizedBuf::Holder h_raw_meta(raw_meta); // auto free raw meta. + raw_meta.setNeedToFree(); + bool is_tombstone_out = false; + TableFile::rawMetaToUserMeta(raw_meta, is_tombstone_out, user_meta_out); + + user_meta_out.moveTo( rec_out.meta ); + + rec_out.seqNum = doc->seqnum; + rec_out.type = (is_tombstone_out || doc->deleted) + ? 
Record::DELETION + : Record::INSERTION; + + return Status(); +} + +Status TableFile::Iterator::getMeta(Record& rec_out, + size_t& valuelen_out, + uint64_t& offset_out) +{ + if (!tFile || !fdbItr) return Status::NOT_INITIALIZED; + + fdb_status fs; + + fdb_doc tmp_doc; + memset(&tmp_doc, 0x0, sizeof(tmp_doc)); + + fdb_doc *doc = &tmp_doc; + fs = fdb_iterator_get_metaonly(fdbItr, &doc); + if (fs != FDB_RESULT_SUCCESS) { + return Status::ERROR; + } + + rec_out.kv.key.set(doc->keylen, doc->key); + rec_out.kv.key.setNeedToFree(); + valuelen_out = doc->bodylen; + offset_out = doc->offset; + + // Decode meta. + SizedBuf user_meta_out; + SizedBuf raw_meta(doc->metalen, doc->meta);; + SizedBuf::Holder h_raw_meta(raw_meta); // auto free raw meta. + raw_meta.setNeedToFree(); + bool is_tombstone_out = false; + TableFile::rawMetaToUserMeta(raw_meta, is_tombstone_out, user_meta_out); + + user_meta_out.moveTo( rec_out.meta ); + + rec_out.seqNum = doc->seqnum; + rec_out.type = (is_tombstone_out || doc->deleted) + ? Record::DELETION + : Record::INSERTION; + + return Status(); +} + +Status TableFile::Iterator::prev() { + if (!tFile || !fdbItr) return Status::NOT_INITIALIZED; + + fdb_status fs; + fs = fdb_iterator_prev(fdbItr); + if (fs != FDB_RESULT_SUCCESS) { + fs = fdb_iterator_next(fdbItr); + assert(fs == FDB_RESULT_SUCCESS); + return Status::OUT_OF_RANGE; + } + return Status(); +} + +Status TableFile::Iterator::next() { + if (!tFile || !fdbItr) return Status::NOT_INITIALIZED; + + fdb_status fs; + fs = fdb_iterator_next(fdbItr); + if (fs != FDB_RESULT_SUCCESS) { + fs = fdb_iterator_prev(fdbItr); + assert(fs == FDB_RESULT_SUCCESS); + return Status::OUT_OF_RANGE; + } + return Status(); +} + +Status TableFile::Iterator::seek(const SizedBuf& key, SeekOption opt) { + if (key.empty()) return gotoBegin(); + + if (!tFile || !fdbItr) return Status::NOT_INITIALIZED; + + fdb_status fs; + fdb_iterator_seek_opt_t fdb_seek_opt = + (opt == GREATER) + ? 
FDB_ITR_SEEK_HIGHER + : FDB_ITR_SEEK_LOWER; + fs = fdb_iterator_seek(fdbItr, key.data, key.size, fdb_seek_opt); + if (fs != FDB_RESULT_SUCCESS) { + if (opt == GREATER) { + fs = fdb_iterator_seek_to_max(fdbItr); + } else { + fs = fdb_iterator_seek_to_min(fdbItr); + } + if (fs != FDB_RESULT_SUCCESS) return Status::OUT_OF_RANGE; + } + return Status(); +} + +Status TableFile::Iterator::seekSN(const uint64_t seqnum, SeekOption opt) { + if (!tFile || !fdbItr) return Status::NOT_INITIALIZED; + + fdb_status fs; + fdb_iterator_seek_opt_t fdb_seek_opt = + (opt == GREATER) + ? FDB_ITR_SEEK_HIGHER + : FDB_ITR_SEEK_LOWER; + fs = fdb_iterator_seek_byseq(fdbItr, seqnum, fdb_seek_opt); + if (fs != FDB_RESULT_SUCCESS) { + if (opt == GREATER) { + fs = fdb_iterator_seek_to_max(fdbItr); + } else { + fs = fdb_iterator_seek_to_min(fdbItr); + } + assert(fs == FDB_RESULT_SUCCESS); + } + return Status(); +} + +Status TableFile::Iterator::gotoBegin() { + if (!tFile || !fdbItr) return Status::NOT_INITIALIZED; + + fdb_status fs; + fs = fdb_iterator_seek_to_min(fdbItr); + (void)fs; + return Status(); +} + +Status TableFile::Iterator::gotoEnd() { + if (!tFile || !fdbItr) return Status::NOT_INITIALIZED; + + fdb_status fs; + fs = fdb_iterator_seek_to_max(fdbItr); + (void)fs; + return Status(); +} + +Status TableFile::Iterator::close() { + if (fdbItr) { + fdb_iterator_close(fdbItr); + fdbItr = nullptr; + } + if (fdbSnap) { + fdb_kvs_close(fdbSnap); + fdbSnap = nullptr; + } + if (tFileSnap) { + tFile->returnSnapshot(tFileSnap); + tFileSnap = nullptr; + } + return Status(); +} + +}; // namespace jungle + diff --git a/src/table_helper.cc b/src/table_helper.cc new file mode 100644 index 0000000..0bb479f --- /dev/null +++ b/src/table_helper.cc @@ -0,0 +1,211 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "table_helper.h" + +#include "hex_dump.h" +#include "table_file.h" +#include "table_manifest.h" +#include "table_mgr.h" + +#include _MACRO_TO_STR(LOGGER_H) + +namespace jungle { + +inline void hex_dump_key(const SizedBuf& key, size_t maxlen, char** bp, size_t* len) { + print_hex_options hex_opt = PRINT_HEX_OPTIONS_INITIALIZER; + hex_opt.enable_colors = 0; + hex_opt.actual_address = 0; + print_hex_to_buf(bp, len, + key.data, std::min((size_t)key.size, (size_t)maxlen), + hex_opt); +} + +std::string hex_dump_to_string(const SizedBuf& key, size_t max_len) { + char* bp = nullptr; + size_t bp_size = 0; + hex_dump_key(key, max_len, &bp, &bp_size); + std::string ret(bp, bp_size); + free(bp); + return ret; +} + +int cmp_records(const DBConfig* db_config, + const Record* l, + const Record* r) +{ + // Rigth is null, pick Left. + if ( l && !r) return -1; + // Left is null, pick Right. + if (!l && r) return 1; + + // Both are null, shouldn't happen. + if (!l && !r) assert(0); + + if (db_config->cmpFunc) { + // Custom cmp mode. 
+ CustomCmpFunc func = db_config->cmpFunc; + void* param = db_config->cmpFuncParam; + return func(l->kv.key.data, l->kv.key.size, + r->kv.key.data, r->kv.key.size, param); + } + return SizedBuf::cmp(l->kv.key, r->kv.key); +} + +bool cmp_records_bool(const DBConfig* db_config, + const Record* l, + const Record* r) +{ + return (cmp_records(db_config, l, r) < 0); +} + +std::vector table_list_to_number(const std::list src) { + std::vector ret(src.size()); + + size_t idx = 0; + for (auto& entry: src) { + TableInfo* t_info = entry; + ret[idx++] = t_info->number; + } + return std::move(ret); +} + +bool TableMgr::isTableLocked(uint64_t t_number) { + std::lock_guard l(lockedTablesLock); + auto found = lockedTables.find( t_number ); + return (found != lockedTables.end()); +} + +bool TableMgr::isLevelLocked(uint64_t l_number) { + std::lock_guard l(lockedLevelsLock); + auto found = lockedLevels.find( l_number ); + return (found != lockedLevels.end()); +} + + +// ========== + +TableLockHolder::TableLockHolder + ( TableMgr* t_mgr, + const std::vector& tables ) + : tMgr(t_mgr), fOwnsLock(false) +{ + if (!tables.size()) { + fOwnsLock = true; + return; + } + + std::string msg = "try to lock table "; + for (auto& entry: tables) msg += std::to_string(entry) + " "; + + { std::lock_guard l(tMgr->lockedTablesLock); + for (auto& entry: tables) { + auto found = tMgr->lockedTables.find(entry); + if (found != tMgr->lockedTables.end()) { + // Already locked, fail. + _s_info(tMgr->myLog) + << msg << "=> locking failed due to table " << entry; + return; + } + } + + // No existing locked table. Lock them all. + for (auto& entry: tables) tMgr->lockedTables.insert(entry); + // Clone. 
+ holdingTables = tables; + fOwnsLock = true; + } + _s_info(tMgr->myLog) << msg << "=> succeeded"; +} + +TableLockHolder::~TableLockHolder() { + unlock(); +} + +bool TableLockHolder::ownsLock() const { + return fOwnsLock; +} + +void TableLockHolder::unlock() { + if (!fOwnsLock) return; + if (!holdingTables.size()) return; + + std::string msg = "unlock table "; + for (auto& entry: holdingTables) msg += std::to_string(entry) + " "; + _s_info(tMgr->myLog) << msg; + + std::lock_guard l(tMgr->lockedTablesLock); + for (auto& entry: holdingTables) { + auto found = tMgr->lockedTables.find(entry); + // If not exist, that must be a bug. + assert(found != tMgr->lockedTables.end()); + tMgr->lockedTables.erase(found); + } + holdingTables.clear(); + fOwnsLock = false; +} + + +// ========= + +LevelLockHolder::LevelLockHolder(TableMgr* t_mgr, + size_t level_to_lock) + : tMgr(t_mgr) + , fOwnsLock(false) + , levelLocked(level_to_lock) +{ + if (level_to_lock >= tMgr->mani->getNumLevels()) return; + + + std::string msg = "try to lock level " + std::to_string(level_to_lock); + { std::lock_guard l(tMgr->lockedLevelsLock); + auto entry = tMgr->lockedLevels.find(level_to_lock); + if (entry != tMgr->lockedLevels.end()) { + _s_info(tMgr->myLog) << msg << " => locking failed"; + return; + } + + tMgr->lockedLevels.insert(level_to_lock); + fOwnsLock = true; + } + _s_info(tMgr->myLog) << msg << " => succeeded"; +} + +LevelLockHolder::~LevelLockHolder() { + unlock(); +} + +bool LevelLockHolder::ownsLock() const { + return fOwnsLock; +} + +void LevelLockHolder::unlock() { + if (!fOwnsLock) return; + + std::string msg = "unlock level " + std::to_string(levelLocked); + _s_info(tMgr->myLog) << msg; + + std::lock_guard l(tMgr->lockedLevelsLock); + auto entry = tMgr->lockedLevels.find(levelLocked); + // If not exist, that must be a bug. 
+ assert(entry != tMgr->lockedLevels.end()); + tMgr->lockedLevels.erase(entry); + + fOwnsLock = false; +} + +}; // namespace jungle + diff --git a/src/table_helper.h b/src/table_helper.h new file mode 100644 index 0000000..3945e47 --- /dev/null +++ b/src/table_helper.h @@ -0,0 +1,78 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "event_awaiter.h" +#include "internal_helper.h" + +#include + +#include +#include +#include + +class SimpleLogger; + +namespace jungle { + +class TableFile; +class TableMgr; +struct TableInfo; + +void hex_dump_key(const SizedBuf& key, size_t maxlen, char** bp, size_t* len); + +std::string hex_dump_to_string(const SizedBuf& key, size_t max_len = 256); + +int cmp_records(const DBConfig* db_config, + const Record* l, + const Record* r); + +bool cmp_records_bool(const DBConfig* db_config, + const Record* l, + const Record* r); + +std::vector table_list_to_number(const std::list src); + +class TableLockHolder { +public: + TableLockHolder(TableMgr* t_mgr, + const std::vector& tables); + ~TableLockHolder(); + bool ownsLock() const; + void unlock(); + +private: + TableMgr* tMgr; + bool fOwnsLock; + std::vector holdingTables; +}; + +class LevelLockHolder { +public: + LevelLockHolder(TableMgr* t_mgr, + size_t level_to_lock); + ~LevelLockHolder(); + bool 
ownsLock() const; + void unlock(); +private: + TableMgr* tMgr; + bool fOwnsLock; + size_t levelLocked; +}; + +}; // namespace jungle + diff --git a/src/table_iterator.cc b/src/table_iterator.cc new file mode 100644 index 0000000..659f42c --- /dev/null +++ b/src/table_iterator.cc @@ -0,0 +1,499 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "table_mgr.h" + +#include "db_internal.h" + +namespace jungle { + +// === Iterator + +TableMgr::Iterator::Iterator() + : tMgr(nullptr), + snapTableList(nullptr), + minSeqSnap(NOT_INITIALIZED), + maxSeqSnap(NOT_INITIALIZED) +{ + avl_init(&curWindow, nullptr); +} + +TableMgr::Iterator::~Iterator() { + close(); +} + +void TableMgr::Iterator::addTableItr(DB* snap_handle, TableInfo* t_info) { + // Open iterator. + TableFile::Iterator* t_itr = new TableFile::Iterator(); + if (type == BY_SEQ) { + t_itr->initSN(snap_handle, t_info->file, minSeqSnap, maxSeqSnap); + } else if (type == BY_KEY) { + t_itr->init(snap_handle, t_info->file, startKey, endKey); + } + + ItrItem* ctx = new ItrItem(); + ctx->tInfo = t_info; + ctx->tItr = t_itr; + // If this iterator is out-of-range, `lastRec` will be empty. + Status s = t_itr->get(ctx->lastRec); + if (s) { + // Insert available iterator only. + avl_cmp_func* cmp_func = (type == BY_SEQ) + ? 
(ItrItem::cmpSeq) + : (ItrItem::cmpKey); + avl_insert(&curWindow, &ctx->an, cmp_func); + } + tables[t_info->level].push_back(ctx); +} + +Status TableMgr::Iterator::init(DB* snap_handle, + TableMgr* table_mgr, + const SizedBuf& start_key, + const SizedBuf& end_key) +{ + uint64_t empty_seq = NOT_INITIALIZED; + return initInternal(snap_handle, table_mgr, + empty_seq, empty_seq, + start_key, end_key, + BY_KEY); +} + +Status TableMgr::Iterator::initSN(DB* snap_handle, + TableMgr* table_mgr, + uint64_t min_seq, + uint64_t max_seq) +{ + SizedBuf empty_key; + return initInternal(snap_handle, table_mgr, + min_seq, max_seq, + empty_key, empty_key, + BY_SEQ); +} + +Status TableMgr::Iterator::initInternal(DB* snap_handle, + TableMgr* table_mgr, + uint64_t min_seq, + uint64_t max_seq, + const SizedBuf& start_key, + const SizedBuf& end_key, + TableMgr::Iterator::Type _type) +{ + if (table_mgr->getDbConfig()->logSectionOnly) + return Status::TABLES_ARE_DISABLED; + + tMgr = table_mgr; + type = _type; + if (tMgr->getDbConfig()->cmpFunc) { + // Custom cmp mode. 
+ avl_set_aux(&curWindow, (void*)tMgr); + } + + minSeqSnap = min_seq; + maxSeqSnap = max_seq; + if ( snap_handle && + maxSeqSnap > snap_handle->sn->chkNum ) { + maxSeqSnap = snap_handle->sn->chkNum; + if (!valid_number(min_seq)) minSeqSnap = 0; + } + startKey.alloc(start_key); + endKey.alloc(end_key); + + try { + size_t n_levels = tMgr->mani->getNumLevels(); + tables.resize(n_levels); + + if (snap_handle) { + // Snapshot + assert(snap_handle->sn->tableList); + for (auto& entry: *snap_handle->sn->tableList) { + TableInfo* t_info = entry; + addTableItr(snap_handle, t_info); + } + snapTableList = snap_handle->sn->tableList; + } else { + // Normal + for (size_t ii=0; ii t_info_ret; + SizedBuf empty_key; + if (ii == 0) { + tMgr->mani->getTablesRange(ii, empty_key, empty_key, t_info_ret); + } else { + tMgr->mani->getTablesRange(ii, start_key, end_key, t_info_ret); + } + + for (auto& entry: t_info_ret) { + TableInfo* t_info = entry; + addTableItr(snap_handle, t_info); + } + } + } + + windowCursor = avl_first(&curWindow); + if (!windowCursor) throw Status(Status::OUT_OF_RANGE); + return Status(); + + } catch (Status s) { + startKey.free(); + endKey.free(); + return s; + } +} + +Status TableMgr::Iterator::get(Record& rec_out) { + if (!windowCursor) return Status::KEY_NOT_FOUND; + + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + rec_out = item->lastRec; + return Status(); +} + +Status TableMgr::Iterator::prev() { + Status s; + + ItrItem* cur_item = _get_entry(windowCursor, ItrItem, an); + uint64_t cur_seq = cur_item->lastRec.seqNum; + SizedBuf cur_key; + cur_key.alloc(cur_item->lastRec.kv.key); + + // Do next() for all iterators GTEQ windowCursor. 
+ avl_node* cursor = avl_last(&curWindow); + while (cursor) { + ItrItem* item = _get_entry(cursor, ItrItem, an); + if (item->flags & ItrItem::no_more_prev) { + s = Status::ERROR; + } else { + if ( type == BY_SEQ && + item->lastRec.seqNum < cur_seq) break; + if ( type == BY_KEY && + cmpSizedBuf(item->lastRec.kv.key, cur_key) < 0 ) break; + s = item->tItr->prev(); + } + + if (s) { + avl_remove(&curWindow, &item->an); + item->flags = ItrItem::none; + item->lastRec.free(); + s = item->tItr->get(item->lastRec); + assert(s); + + avl_cmp_func* cmp_func = (type == BY_SEQ) + ? (ItrItem::cmpSeq) + : (ItrItem::cmpKey); + avl_insert(&curWindow, &item->an, cmp_func); + cursor = avl_last(&curWindow); + } else { + item->flags |= ItrItem::no_more_prev; + cursor = avl_prev(&item->an); + } + } + + windowCursor = avl_last(&curWindow); + ItrItem* last_valid_item = nullptr; + while (windowCursor) { + // Find *LAST* valid item (only for BY_KEY). + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + + bool valid = false; + if (type == BY_SEQ) { + valid = checkValidBySeq(item, cur_seq, true); + if (!valid) windowCursor = avl_prev(windowCursor); + else break; + + } else if (type == BY_KEY) { + valid = checkValidByKey(item, cur_key, true); + if (last_valid_item && + cmpSizedBuf(item->lastRec.kv.key, + last_valid_item->lastRec.kv.key) < 0) break; + if (valid) last_valid_item = item; + windowCursor = avl_prev(windowCursor); + } + } + + if (last_valid_item) windowCursor = &last_valid_item->an; + + cur_key.free(); + if (!windowCursor) { + // Reached the end. + windowCursor = avl_first(&curWindow); + return Status::OUT_OF_RANGE; + } + return Status(); +} + +Status TableMgr::Iterator::next() { + Status s; + + ItrItem* cur_item = _get_entry(windowCursor, ItrItem, an); + uint64_t cur_seq = cur_item->lastRec.seqNum; + SizedBuf cur_key; + cur_key.alloc(cur_item->lastRec.kv.key); + + // Do next() for all iterators SMEQ windowCursor. 
+ avl_node* cursor = avl_first(&curWindow); + while (cursor) { + ItrItem* item = _get_entry(cursor, ItrItem, an); + if (item->flags & ItrItem::no_more_next) { + s = Status::ERROR; + } else { + if ( type == BY_SEQ && + item->lastRec.seqNum > cur_seq) break; + if ( type == BY_KEY && + cmpSizedBuf(item->lastRec.kv.key, cur_key) > 0 ) break; + s = item->tItr->next(); + } + + if (s) { + avl_remove(&curWindow, &item->an); + item->flags = ItrItem::none; + item->lastRec.free(); + s = item->tItr->get(item->lastRec); + assert(s); + + avl_cmp_func* cmp_func = (type == BY_SEQ) + ? (ItrItem::cmpSeq) + : (ItrItem::cmpKey); + avl_insert(&curWindow, &item->an, cmp_func); + cursor = avl_first(&curWindow); + } else { + item->flags |= ItrItem::no_more_next; + cursor = avl_next(&item->an); + } + } + + windowCursor = avl_first(&curWindow); + while (windowCursor) { + // Find first valid item. + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + + bool valid = false; + if (type == BY_SEQ) { + valid = checkValidBySeq(item, cur_seq); + } else if (type == BY_KEY) { + valid = checkValidByKey(item, cur_key); + } + + if (!valid) { + windowCursor = avl_next(windowCursor); + } else { + break; + } + } + + cur_key.free(); + if (!windowCursor) { + // Reached the end. 
+ windowCursor = avl_last(&curWindow); + return Status::OUT_OF_RANGE; + } + return Status(); +} + +Status TableMgr::Iterator::seek(const SizedBuf& key, SeekOption opt) { + return seekInternal(key, NOT_INITIALIZED, opt); +} + +Status TableMgr::Iterator::seekSN(const uint64_t seqnum, SeekOption opt) { + SizedBuf dummy_key; + return seekInternal(dummy_key, seqnum, opt); +} + +Status TableMgr::Iterator::gotoBegin() { + SizedBuf empty_key; + return seekInternal(empty_key, 0, GREATER); +} + +Status TableMgr::Iterator::gotoEnd() { + SizedBuf empty_key; + return seekInternal(empty_key, 0, SMALLER, true); +} + + +Status TableMgr::Iterator::seekInternal + ( const SizedBuf& key, + const uint64_t seqnum, + SeekOption opt, + bool goto_end ) +{ + Status s; + + // Remove current items from `curWindow`. + std::vector items; + avl_node* cursor = avl_first(&curWindow); + while (cursor) { + ItrItem* item = _get_entry(cursor, ItrItem, an); + cursor = avl_next(&item->an); + avl_remove(&curWindow, &item->an); + items.push_back(item); + } + + // Seek for all items. + for (auto& entry: items) { + ItrItem*& item = entry; + + if (goto_end) { + // Goto end: special case. + s = item->tItr->gotoEnd(); + + } else { + if (type == BY_SEQ) { + s = item->tItr->seekSN(seqnum, (TableFile::Iterator::SeekOption)opt); + } else { + s = item->tItr->seek(key, (TableFile::Iterator::SeekOption)opt); + } + } + + if (s) { + item->lastRec.free(); + s = item->tItr->get(item->lastRec); + } + if (s) { + int cmp = 0; + if (goto_end) { + // Goto end: special case. 
+ cmp = -1; + + } else { + if (type == BY_SEQ) { + if (item->lastRec.seqNum < seqnum) cmp = -1; + else if (item->lastRec.seqNum > seqnum) cmp = 1; + else cmp = 0; + } else { + cmp = cmpSizedBuf(item->lastRec.kv.key, key); + } + } + + + item->flags = ItrItem::none; + if (opt == GREATER && cmp < 0) { + item->flags |= ItrItem::no_more_next; + } else if (opt == SMALLER && cmp > 0) { + item->flags |= ItrItem::no_more_prev; + } + } else { + item->flags = ItrItem::no_more_prev | + ItrItem::no_more_next; + } + + avl_cmp_func* cmp_func = (type == BY_SEQ) + ? (ItrItem::cmpSeq) + : (ItrItem::cmpKey); + avl_node* avl_ret = avl_insert(&curWindow, &item->an, cmp_func); + assert(avl_ret == &item->an); + (void)avl_ret; + } + + if (opt == GREATER) { + windowCursor = avl_first(&curWindow); + while (windowCursor) { + // Find first valid item. + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + + if (item->flags == ItrItem::none) break; + else windowCursor = avl_next(windowCursor); + } + } else { // SMALLER + windowCursor = avl_last(&curWindow); + while (windowCursor) { + // Find *LAST* valid item (only for BY_KEY). + ItrItem* item = _get_entry(windowCursor, ItrItem, an); + + if (item->flags == ItrItem::none) break; + else windowCursor = avl_prev(windowCursor); + } + } + + if (!windowCursor) { + // Reached the end. + if (opt == GREATER) windowCursor = avl_last(&curWindow); + if (opt == SMALLER) windowCursor = avl_first(&curWindow); + } + return Status(); +} + + +int TableMgr::Iterator::cmpSizedBuf(const SizedBuf& l, const SizedBuf& r) { + CMP_NULL_CHK(l.data, r.data); + if (tMgr->getDbConfig()->cmpFunc) { + // Custom cmp mode. 
+ CustomCmpFunc func = tMgr->getDbConfig()->cmpFunc; + void* param = tMgr->getDbConfig()->cmpFuncParam; + return func(l.data, l.size, r.data, r.size, param); + } + return SizedBuf::cmp(l, r); +} + +bool TableMgr::Iterator::checkValidBySeq(ItrItem* item, + const uint64_t cur_seq, + const bool is_prev) +{ + if ( ( !is_prev && (item->flags & ItrItem::no_more_next) ) || + ( is_prev && (item->flags & ItrItem::no_more_prev) ) ) { + return false; + } else if (item->lastRec.seqNum == cur_seq) { + // Duplicate item, skip. + return false; + } + return true; +} + +bool TableMgr::Iterator::checkValidByKey(ItrItem* item, + const SizedBuf& cur_key, + const bool is_prev) +{ + if ( ( !is_prev && (item->flags & ItrItem::no_more_next) ) || + ( is_prev && (item->flags & ItrItem::no_more_prev) ) ) { + return false; + } else if (cmpSizedBuf(item->lastRec.kv.key, cur_key) == 0) { + // Duplicate item, skip. + return false; + } + return true; +} + + +Status TableMgr::Iterator::close() { + if (!tMgr) return Status(); + + avl_node* cursor = avl_first(&curWindow); + while (cursor) { + ItrItem* item = _get_entry(cursor, ItrItem, an); + cursor = avl_next(&item->an); + avl_remove(&curWindow, &item->an); + } + + for (auto& _level: tables) { + for (auto& _table: _level) { + ItrItem* ctx = _table; + ctx->tItr->close(); + ctx->lastRec.free(); + delete ctx->tItr; + if (!snapTableList) { + // Only when not a snapshot. + ctx->tInfo->done(); + } + delete ctx; + } + } + + tMgr = nullptr; + windowCursor = nullptr; + startKey.free(); + endKey.free(); + + return Status(); +} + +}; // namespace jungle diff --git a/src/table_lookup_booster.cc b/src/table_lookup_booster.cc new file mode 100644 index 0000000..ccc827c --- /dev/null +++ b/src/table_lookup_booster.cc @@ -0,0 +1,76 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "table_lookup_booster.h" + +#include "table_file.h" +#include "table_mgr.h" + +#include _MACRO_TO_STR(LOGGER_H) + +namespace jungle { + +TableLookupBooster::TableLookupBooster(uint64_t limit, + const TableMgr* table_mgr, + const TableFile* table_file) + : curLimit(limit) + , tMgr(table_mgr) + , tFile(table_file) + , elems(curLimit) + , elemsLock(17) +{ + // Will be used in the future. 
+ (void)tMgr; + (void)tFile; +} + +TableLookupBooster::~TableLookupBooster() { +} + +size_t TableLookupBooster::size() const { + return elems.size(); +} + +Status TableLookupBooster::get(uint32_t h_value, uint64_t& offset_out) { + size_t lock_idx = h_value % elemsLock.size(); + size_t elem_idx = h_value % elems.size(); + + std::lock_guard l( elemsLock[lock_idx] ); + Elem& existing = elems[elem_idx]; + if (existing.hValue() == h_value) { + offset_out = existing.offset(); + return Status(); + } + return Status::KEY_NOT_FOUND; +} + +Status TableLookupBooster::setIfNew(const TableLookupBooster::Elem& elem) { + size_t lock_idx = elem.hValue() % elemsLock.size(); + size_t elem_idx = elem.hValue() % elems.size(); + + std::lock_guard l( elemsLock[lock_idx] ); + Elem& existing = elems[elem_idx]; + if ( existing.hValue() == elem.hValue() && + existing.seqNum() >= elem.seqNum() ) { + return Status::ALREADY_EXIST; + } + + elems[elem_idx] = elem; + return Status(); +} + +} + diff --git a/src/table_lookup_booster.h b/src/table_lookup_booster.h new file mode 100644 index 0000000..11af7fa --- /dev/null +++ b/src/table_lookup_booster.h @@ -0,0 +1,96 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "internal_helper.h" +#include "list.h" + +#include + +#include +#include + +namespace jungle { + +class TableMgr; +class TableFile; +class TableLookupBooster { +public: + class Elem { + public: + Elem(uint32_t h = 0, + uint64_t s = 0, + uint64_t o = 0) + { + uint8_t* ptr = (uint8_t*)data; + *((uint32_t*)(ptr + 0)) = h; + *((uint64_t*)(ptr + 4)) = s; + *((uint64_t*)(ptr + 12)) = o; + } + // Marshalling. + inline uint32_t hValue() const { + uint8_t* ptr = (uint8_t*)data; + return *((uint32_t*)(ptr + 0)); + } + inline uint64_t seqNum() const { + uint8_t* ptr = (uint8_t*)data; + return *((uint64_t*)(ptr + 4)); + } + inline uint64_t offset() const { + uint8_t* ptr = (uint8_t*)data; + return *((uint64_t*)(ptr + 12)); + } + private: + char data[20]; + }; + + TableLookupBooster(uint64_t limit, + const TableMgr* table_mgr, + const TableFile* table_file); + + ~TableLookupBooster(); + + size_t size() const; + + Status get(uint32_t h_value, uint64_t& offset_out); + + Status setIfNew(const Elem& elem); + +private: +// --- TYPES --- + +// --- FUNCTIONS --- + +// --- VARIABLES --- + // Limit (the number of elems). + uint64_t curLimit; + + // Table manager. + const TableMgr* tMgr; + + // Table file. + const TableFile* tFile; + + // Element array. + std::vector elems; + + // Lock for `elems`. + mutable std::vector elemsLock; +}; + +} + diff --git a/src/table_manifest.cc b/src/table_manifest.cc new file mode 100644 index 0000000..f1d020f --- /dev/null +++ b/src/table_manifest.cc @@ -0,0 +1,658 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "table_manifest.h" + +#include "crc32.h" +#include "internal_helper.h" +#include "table_mgr.h" + +#include + +namespace jungle { + +static uint8_t TABMANI_FOOTER[8] = {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0xab, 0xef}; +static uint32_t TABMANI_VERSION = 0x1; + + +struct TableManifest::LevelInfo { + LevelInfo() + : numTables(0) + { + tables = new skiplist_raw(); + skiplist_init(tables, TableInfo::cmp); + } + + ~LevelInfo() { + if (tables) { + skiplist_node* cursor = skiplist_begin(tables); + while (cursor) { + TableInfo* t_info = _get_entry(cursor, TableInfo, snode); + cursor = skiplist_next(tables, cursor); + + TableStack* stack = t_info->stack; + if (stack) { + std::lock_guard l(stack->lock); + for (TableInfo* tt: stack->tables) { + delete tt->file; + delete tt; + } + } + + delete t_info->file; + delete t_info; + } + skiplist_free(tables); + delete tables; + } + } + + std::atomic numTables; + skiplist_raw* tables; +}; + + +TableManifest::TableManifest(const TableMgr* table_mgr, + FileOps* _f_ops) + : tableMgr(table_mgr) + , fOps(_f_ops) + , mFile(nullptr) + , maxTableNum(NOT_INITIALIZED) + , myLog(nullptr) +{ + levels.reserve(NUM_RESERVED_LEVELS); +} + +TableManifest::~TableManifest() { + if (mFile) { + // `delete` will close file if opened. + delete mFile; + } + for (size_t ii=0; iiexist(filename.c_str())) return Status::ALREADY_EXIST; + if (filename.empty()) return Status::INVALID_PARAMETERS; + + dirPath = path; + mFileName = filename; + + // Create a new file. 
+ Status s; + EP(fOps->open(&mFile, mFileName.c_str())); + if (!s) return s; + + // Initially there is only one level (level-0). + levels.resize(1); + levels[0] = new LevelInfo(); + + // Store initial data. + EP(store()); + + return Status(); +} + +Status TableManifest::load(const std::string& path, + const uint64_t prefix_num, + const std::string& filename) +{ + if (!fOps) return Status::NOT_INITIALIZED; + if (!fOps->exist(filename.c_str())) return Status::FILE_NOT_EXIST; + if (filename.empty()) return Status::INVALID_PARAMETERS; + + dirPath = path; + mFileName = filename; + + Status s; + EP( fOps->open(&mFile, mFileName.c_str()) ); + + try { + // File should be bigger than 16 bytes (FOOTER + version + CRC32). + size_t file_size = fOps->eof(mFile); + if (file_size < 16) throw Status(Status::FILE_CORRUPTION); + + // Footer check + RwSerializer ss(fOps, mFile); + uint8_t footer_file[8]; + ss.pos(file_size - 16); + ss.get(footer_file, 8); + if (memcmp(TABMANI_FOOTER, footer_file, 8) != 0) { + throw Status(Status::FILE_CORRUPTION); + } + + // Version check + uint32_t ver_file = ss.getU32(s); + (void)ver_file; + + // CRC check + uint32_t crc_file = ss.getU32(s); + + SizedBuf chk_buf(file_size - 4); + SizedBuf::Holder h_chk_buf(chk_buf); + ss.pos(0); + ss.get(chk_buf.data, chk_buf.size); + uint32_t crc_local = crc32_8(chk_buf.data, chk_buf.size, 0); + if (crc_local != crc_file) throw Status(Status::CHECKSUM_ERROR); + + // Max (latest) table file number. + ss.pos(0); + maxTableNum.store(ss.getU64(s), MOR); + + // Number of levels. + uint32_t num_levels = ss.getU32(s); + levels.resize(num_levels); + + for (size_t ii=0; iinumTables.store(num_tables, MOR); + + for (size_t jj=0; jj= 4) hash_num = ss.getU32(s); + if (k_size >= 8) status = ss.getU32(s); + + t_info = new TableInfo(ii, table_number, hash_num); + if (status) t_info->status = (TableInfo::Status)status; + + } else { + // Level-1+: read min key. 
+ t_info = new TableInfo(ii, table_number, 0); + if (k_size) { + t_info->minKey.alloc(k_size, nullptr); + s = ss.get(t_info->minKey.data, k_size); + } + } + + TableFile* t_file = new TableFile(tableMgr); + t_file->setLogger(myLog); + std::string t_filename = TableFile::getTableFileName + ( dirPath, prefix_num, table_number ); + + TableFileOptions t_opt; + t_file->load(ii, table_number, t_filename, fOps, t_opt); + + // Checkpoints + t_file->loadCheckpoints(ss); + + t_info->file = t_file; + t_info->file->setTableInfo(t_info); + + bool add_to_skiplist = true; + if (add_to_skiplist) { + skiplist_insert(l_info->tables, &t_info->snode); + } + } + } + + return Status(); + + } catch (Status s) { + fOps->close(mFile); + DELETE(mFile); + return s; + } +} + +Status TableManifest::store() { + if (mFileName.empty() || !fOps) return Status::NOT_INITIALIZED; + + Status s; + //EP(BackupRestore::backup(fOps, mFileName)); + + RwSerializer ss(fOps, mFile); + + // << Table manifest file format >> + // Latest table file number, 8 bytes + // Current number of levels (N), 4 bytes + // +--- + // | Level entry, ... + // +--- N times + ss.putU64(maxTableNum.load(MOR)); + uint32_t num_levels = levels.size(); + ss.putU32(num_levels); + + for (size_t ii=0; ii> + // Number of tables (M), 4 bytes + // +--- + // | Table entry, ... + // +--- M times + LevelInfo* l_info = levels[ii]; + ss.putU32(l_info->numTables.load(MOR)); + skiplist_node* cursor = skiplist_begin(l_info->tables); + while (cursor) { + // << Table entry format >> + // Table number, 8 bytes + // Table min key length (L), 4 bytes + // Table min key (L0: hash num + a), L bytes + // Number of checkpoints (K), 4 bytes + // +--- + // | Checkpoint seq num, 8 bytes + // | ForestDB seq num, 8 bytes + // +--- K times + TableInfo* t_info = _get_entry(cursor, TableInfo, snode); + + // Do not save if this file is COMPACT_DST: + // it means that this file is incomplete yet. 
+ if (t_info->isValid()) { + ss.putU64(t_info->number); + if ( t_info->level == 0 ) { + // Level-0 (hash): + // hash num 4 bytes + // file status + flags 4 bytes + ss.putU32(sizeof(t_info->hashNum) + sizeof(uint32_t)); + ss.putU32(t_info->hashNum); + ss.putU32((uint32_t)t_info->status.load()); + + } else { + // Level-1+: append min key. + ss.putU32(t_info->minKey.size); + ss.put(t_info->minKey.data, t_info->minKey.size); + } + t_info->file->appendCheckpoints(ss); + } + + cursor = skiplist_next(l_info->tables, cursor); + skiplist_release_node(&t_info->snode); + } + if (cursor) skiplist_release_node(cursor); + } + + // Footer. + ss.put(TABMANI_FOOTER, 8); + + // Version. + ss.putU32(TABMANI_VERSION); + + // CRC32 + size_t ctx_size = ss.pos(); + + // Read whole file. + RwSerializer ss_read(fOps, mFile); + SizedBuf temp_buf(ctx_size); + SizedBuf::Holder h_temp_buf(temp_buf); + ss_read.get(temp_buf.data, temp_buf.size); + uint32_t crc_val = crc32_8(temp_buf.data, temp_buf.size, 0); + + ss.putU32(crc_val); + + // Should truncate tail. + fOps->ftruncate(mFile, ss.pos()); + + // After success, make a backup file one more time, + // using the latest data. 
+ EP(BackupRestore::backup(fOps, mFileName)); + + return Status(); +} + +Status TableManifest::storeTableStack(RwSerializer& ss, + TableInfo* base_table) +{ + if ( !base_table || + !base_table->baseTable || + !base_table->stack ) return Status(); + + TableStack* stack = base_table->stack; + + std::lock_guard l(stack->lock); + for (auto& entry: stack->tables) { + TableInfo* cur_table = entry; + ss.putU64(cur_table->number); + ss.putU32(cur_table->minKey.size); + ss.put(cur_table->minKey.data, cur_table->minKey.size); + cur_table->file->appendCheckpoints(ss); + } + return Status(); +} + +Status TableManifest::sync() { + return fOps->fsync(mFile); +} + +Status TableManifest::extendLevel() { + size_t new_level = 0; + { std::lock_guard l(levelsLock); + new_level = levels.size(); + levels.resize(new_level + 1); + levels[new_level] = new LevelInfo(); + } + uint64_t level_limit = tableMgr->getLevelSizeLimit(new_level); + uint64_t table_limit = tableMgr->getDbConfig()->getMaxTableSize(new_level); + + _log_info( myLog, + "NEW LEVEL: %zu, table size limit %zu (%s), " + "level size limit %zu (%s)", + new_level, + table_limit, Formatter::sizeToString(table_limit, 2).c_str(), + level_limit, Formatter::sizeToString(level_limit, 2).c_str() ); + return Status(); +} + +Status TableManifest::issueTableNumber(uint64_t& new_table_number) { + uint64_t expected = NOT_INITIALIZED; + uint64_t val = 0; + if (maxTableNum.compare_exchange_weak(expected, val)) { + // The first table file, number 0. + } else { + // Otherwise: current max + 1. + do { + expected = maxTableNum; + val = maxTableNum + 1; + } while (!maxTableNum.compare_exchange_weak(expected, val)); + } + new_table_number = val; + return Status(); +} + +Status TableManifest::addTableFile(size_t level, + uint32_t hash_num, + SizedBuf min_key, + TableFile* t_file, + bool allow_stacking) +{ + if (level >= levels.size()) return Status::INVALID_LEVEL; + + // New table corresponding to the number. 
+ TableInfo* t_info = new TableInfo(level, t_file->getNumber(), hash_num); + t_info->file = t_file; + t_info->file->setTableInfo(t_info); + t_info->file->setLogger(myLog); + min_key.copyTo(t_info->minKey); + + LevelInfo* cur_level = levels[level]; + + // Non-tiering mode: + // Duplicate min-key SHOULD NOT exist. + skiplist_insert(cur_level->tables, &t_info->snode); + cur_level->numTables.fetch_add(1); + _log_info( myLog, "level %zu: added table %zu_%zu (hash %zu) to manifest, " + "min key %s, num tables %zu", + level, + tableMgr->getTableMgrOptions()->prefixNum, + t_file->getNumber(), hash_num, + t_info->minKey.toReadableString().c_str(), + cur_level->numTables.load() ); + return Status(); +} + +Status TableManifest::removeTableFile(size_t level, + TableInfo* table_to_remove) +{ + if (level >= levels.size()) return Status::INVALID_LEVEL; + + LevelInfo* cur_level = levels[level]; + if (!table_to_remove->baseTable) { + // Stack of other table. Do nothing on skiplist. + table_to_remove->setRemoved(); + cur_level->numTables.fetch_sub(1); + _log_info(myLog, "level %zu: removed table %zu_%zu from stack " + "ref count %zu, num tables %zu", + level, + tableMgr->getTableMgrOptions()->prefixNum, + table_to_remove->number, + table_to_remove->refCount.load(), + cur_level->numTables.load()); + + return Status(); + } + + uint64_t table_num = table_to_remove->number; + + skiplist_node* cursor = skiplist_begin(cur_level->tables); + while (cursor) { + TableInfo* t_info = _get_entry(cursor, TableInfo, snode); + if (t_info->number == table_num) { + skiplist_erase_node(cur_level->tables, &t_info->snode); + skiplist_release_node(&t_info->snode); + skiplist_wait_for_free(&t_info->snode); + + // NOTE: the last done() call will kill itself (suicide). 
+ t_info->setRemoved();
+ cur_level->numTables.fetch_sub(1);
+
+ _log_info( myLog, "level %zu: removed table %zu_%zu from manifest "
+ "min key %s, ref count %zu, num tables %zu",
+ level,
+ tableMgr->getTableMgrOptions()->prefixNum,
+ table_num,
+ t_info->minKey.toReadableString().c_str(),
+ t_info->refCount.load(),
+ cur_level->numTables.load() );
+
+ }
+ cursor = skiplist_next(cur_level->tables, cursor);
+ skiplist_release_node(&t_info->snode);
+ }
+
+ return Status();
+}
+
+bool TableManifest::doesHashPartitionExist(uint32_t hash_num) {
+ LevelInfo* l_info = levels[0];
+ skiplist_node* cursor = skiplist_begin(l_info->tables);
+ if (!cursor) return false;
+
+ while (cursor) {
+ TableInfo* t_info = _get_entry(cursor, TableInfo, snode);
+ if ( t_info->hashNum == hash_num &&
+ t_info->isValid() ) {
+ // Match
+ skiplist_release_node(&t_info->snode);
+ return true;
+ }
+ cursor = skiplist_next(l_info->tables, cursor);
+ skiplist_release_node(&t_info->snode);
+ }
+ return false;
+}
+
+Status TableManifest::getTablesByHash(LevelInfo* l_info,
+ uint32_t target_hash_num,
+ std::list& tables_out)
+{
+ // Find the smallest number NORMAL log file.
+ size_t num_partitions = tableMgr->getNumL0Partitions();
+
+ // NOTE: only for level-0.
+ skiplist_node* cursor = skiplist_begin(l_info->tables);
+ if (!cursor) return Status::TABLE_NOT_FOUND;
+
+ while (cursor) {
+ TableInfo* t_info = _get_entry(cursor, TableInfo, snode);
+ // If 1) invalid hash value (implies return all), OR
+ // 2) the file's hash value is matched with given target hash.
+ // AND
+ // 3) the file is valid (NORMAL or COMPACT_SRC), AND
+ // 4) the file is not removed.
+ if ( ( target_hash_num >= num_partitions || + t_info->hashNum == target_hash_num ) && + t_info->isValid() && + !t_info->isRemoved() ) { + // Match + t_info->grab(); + tables_out.push_back(t_info); + } + cursor = skiplist_next(l_info->tables, cursor); + skiplist_release_node(&t_info->snode); + } + if (tables_out.empty()) return Status::TABLE_NOT_FOUND; + + return Status(); +} + +Status TableManifest::getL0Tables(uint32_t target_hash_num, + std::list& tables_out) +{ + LevelInfo* l_info = levels[0]; + return getTablesByHash(l_info, target_hash_num, tables_out); +} + +Status TableManifest::getTablesPoint(const size_t level, + const SizedBuf& key, + std::list& tables_out) +{ + if (level >= levels.size()) return Status::INVALID_LEVEL; + + LevelInfo* l_info = levels[level]; + + if ( level == 0 ) { + // Level-0: hash partition + size_t num_partitions = tableMgr->getNumL0Partitions(); + uint32_t target_hash = getMurmurHash(key, num_partitions); + return getTablesByHash(l_info, target_hash, tables_out); + + } else { + // Otherwise: range-based partition (key) + TableInfo query(level, 0, 0, true); + query.minKey.referTo(key); + skiplist_node* cursor = skiplist_find_smaller_or_equal + ( l_info->tables, &query.snode ); + if (!cursor) return Status::TABLE_NOT_FOUND; + + TableInfo* t_info = _get_entry(cursor, TableInfo, snode); + t_info->grab(); + pushTablesInStack(t_info, tables_out); + tables_out.push_back(t_info); + + skiplist_release_node(&t_info->snode); + } + + return Status(); +} + +void TableManifest::pushTablesInStack(TableInfo* t_info, + std::list& tables_out) +{ + // If tiering mode, push all stack in reversed order + // (newer -> older -> base (oldest)). 
+ TableStack* stack = t_info->stack; + if (t_info->baseTable && stack) { + std::lock_guard l(stack->lock); + + auto entry = stack->tables.rbegin(); + while (entry != stack->tables.rend()) { + TableInfo* tt = *entry; + tt->grab(); + tables_out.push_back(tt); + entry++; + } + } +} + +Status TableManifest::getTablesRange(const size_t level, + const SizedBuf& min_key, + const SizedBuf& max_key, + std::list& tables_out) +{ + if (level >= levels.size()) return Status::INVALID_LEVEL; + + LevelInfo* l_info = levels[level]; + + if ( level == 0 ) { + // Level-0: hash partition + // Ignore the given range and return all. + size_t num_partitions = tableMgr->getNumL0Partitions(); + return getTablesByHash(l_info, num_partitions, tables_out); + + } else { + TableInfo query(level, 0, 0, true); + query.minKey.referTo(min_key); + skiplist_node* cursor = skiplist_find_smaller_or_equal + ( l_info->tables, &query.snode ); + if (!cursor) cursor = skiplist_begin(l_info->tables); + + while (cursor) { + TableInfo* t_info = _get_entry(cursor, TableInfo, snode); + if ( max_key.empty() || + t_info->minKey <= max_key ) { + t_info->grab(); + pushTablesInStack(t_info, tables_out); + tables_out.push_back(t_info); + } + + cursor = skiplist_next(l_info->tables, cursor); + skiplist_release_node(&t_info->snode); + } + if (cursor) skiplist_release_node(cursor); + } + + return Status(); +} + +void TableManifest::getTableNumbers(std::set& numbers_out) { + for (auto& entry: levels) { + LevelInfo*& l_info = entry; + skiplist_node* cursor = skiplist_begin(l_info->tables); + while (cursor) { + TableInfo* t_info = _get_entry(cursor, TableInfo, snode); + + TableStack* stack = t_info->stack; + if (stack) { + std::lock_guard l(stack->lock); + for (TableInfo* tt: stack->tables) { + numbers_out.insert(tt->number); + } + } + + numbers_out.insert(t_info->number); + cursor = skiplist_next(l_info->tables, cursor); + skiplist_release_node(&t_info->snode); + } + if (cursor) skiplist_release_node(cursor); + } +} + 
+size_t TableManifest::getNumLevels() { + std::lock_guard l(levelsLock); + return levels.size(); +} + +Status TableManifest::getNumTables(size_t level, size_t& num_tables_out) const { + if (level >= levels.size()) return Status::INVALID_LEVEL; + + LevelInfo* l_info = levels[level]; + num_tables_out = l_info->numTables; + return Status(); +} + + +} // namespace jungle + diff --git a/src/table_manifest.h b/src/table_manifest.h new file mode 100644 index 0000000..d8552a6 --- /dev/null +++ b/src/table_manifest.h @@ -0,0 +1,242 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "fileops_base.h" +#include "internal_helper.h" +#include "skiplist.h" +#include "table_file.h" + +#include + +#include _MACRO_TO_STR(LOGGER_H) + +#include +#include +#include +#include + +namespace jungle { + +struct TableInfo; +struct TableStack { + TableStack() {} + ~TableStack() {} + std::list tables; + std::mutex lock; +}; + +struct TableInfo { + enum Status { + NORMAL = 0, + COMPACT_SRC = 1, + COMPACT_DST = 2, + QUERY_PURPOSE = 10000, + }; + + TableInfo(const uint32_t _level, + const uint64_t _num, + const uint64_t _hash_num, + bool query_purpose = false) + : level(_level) + , number(_num) + , hashNum(_hash_num) + , file(nullptr) + , refCount(0) + , removed(false) + , status( query_purpose ? 
QUERY_PURPOSE : NORMAL ) + , stack(nullptr) + , baseTable(true) + { + if (query_purpose) return; + skiplist_init_node(&snode); + } + ~TableInfo() { + if (status == QUERY_PURPOSE) return; + skiplist_free_node(&snode); + minKey.free(); + delete stack.load(); + stack = nullptr; + assert(refCount.load() == 0); + } + + static int cmp(skiplist_node *a, skiplist_node *b, void *aux) { + TableInfo *aa, *bb; + aa = _get_entry(a, TableInfo, snode); + bb = _get_entry(b, TableInfo, snode); + return SizedBuf::cmp(aa->minKey, bb->minKey); + } + + void grab() { refCount.fetch_add(1); } + void done() { + assert(refCount); + // WARNING: Refer to the comment in `LogFileInfo::done()`. + if (removed) { + uint64_t count = refCount.fetch_sub(1); + SimpleLogger* temp = file->getLogger(); + _log_info(temp, "removed level %zu file %zu " + "ref count %zu -> %zu status %d", + level, number, count, count-1, status.load()); + if (count == 1) { + file->destroySelf(); + delete file; + delete this; + } + return; + } + + // Normal case. + refCount.fetch_sub(1); + } + uint64_t getRefCount() const { return refCount.load(MOR); } + void setRemoved() { removed.store(true, MOR); } + bool isRemoved() { return removed.load(MOR); } + + void setCompactSrc() { return status.store(COMPACT_SRC, MOR); } + void setNormal() { return status.store(NORMAL, MOR); } + bool isNormal() const { return status.load(MOR) == NORMAL; } + bool isSrc() const { return status.load(MOR) == COMPACT_SRC; } + bool isValid() const { return status.load(MOR) != COMPACT_DST; } + + // Skiplist metadata. + skiplist_node snode; + + // Level number. + uint32_t level; + + // Table number. + uint64_t number; + + // For tables in level-0 (hash-partition). + uint32_t hashNum; + + // For tables in level-1+ (range-partition). + SizedBuf minKey; + + // Table file instance. + TableFile* file; + + // Reference counter. + std::atomic refCount; + + // Flag indicating whether or not this file is removed. 
+ std::atomic removed; + + // Current table status. + std::atomic status; + + // Tiering: list of stacks: oldest -> ... -> newest. + std::atomic stack; + + // `true` if this table is the owner of stack. + // `false` if this table belongs to the stack of other table. + bool baseTable; +}; + +class TableManifest { +public: + TableManifest(const TableMgr* table_mgr, FileOps* _f_ops); + ~TableManifest(); + + Status create(const std::string& path, + const std::string& filename); + Status load(const std::string& path, + const uint64_t prefix_num, + const std::string& filename); + Status store(); + Status storeTableStack(RwSerializer& ss, TableInfo* base_table); + + Status sync(); + Status extendLevel(); + Status issueTableNumber(uint64_t& new_table_number); + Status addTableFile(size_t level_num, + uint32_t hash_num, + SizedBuf min_key, + TableFile* t_file, + bool allow_stacking = true); + Status removeTableFile(size_t level, TableInfo* table_to_remove); + + bool doesHashPartitionExist(uint32_t hash_num); + + Status getL0Tables(uint32_t target_hash_num, + std::list& tables_out); + + // Note: done() should be called after use. + Status getTablesPoint(const size_t level, + const SizedBuf& key, + std::list& tables_out); + + // Note: done() should be called after use. + Status getTablesRange(const size_t level, + const SizedBuf& min_key, + const SizedBuf& max_key, + std::list& tables_out); + + void getTableNumbers(std::set& numbers_out); + + size_t getNumLevels(); + + Status getNumTables(size_t level, size_t& num_tables_out) const; + + std::mutex& getLock() { return tableUpdateLock; } + + void setLogger(SimpleLogger* logger) { myLog = logger; } + +private: + struct LevelInfo; + + // Always open Table files belonging to level up to 1. 
+ static const size_t MAX_OPENED_FILE_LEVEL = 1; + static const size_t NUM_RESERVED_LEVELS = 16; + + Status getTablesByHash(LevelInfo* l_info, + uint32_t target_hash_num, + std::list& tables_out); + + void pushTablesInStack(TableInfo* t_info, + std::list& tables_out); + + // Backward pointer to table manager. + const TableMgr* tableMgr; + + // File operations. + FileOps* fOps; + + // Manifest file handle. + FileHandle* mFile; + + // Path. + std::string dirPath; + + // Manifest file name. + std::string mFileName; + + // Total LSM levels. + std::mutex levelsLock; + std::vector levels; + + // Current greatest table number. + std::atomic maxTableNum; + + // To guarantee atomic update of manifest. + std::mutex tableUpdateLock; + + // Logger. + SimpleLogger* myLog; +}; + +} // namespace jungle diff --git a/src/table_mgr.cc b/src/table_mgr.cc new file mode 100644 index 0000000..87360ae --- /dev/null +++ b/src/table_mgr.cc @@ -0,0 +1,739 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "table_mgr.h" + +#include "db_internal.h" +#include "internal_helper.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace jungle { + +TableMgr::TableMgr(DB* parent_db) + : APPROX_META_SIZE(96) + , parentDb(parent_db) + , allowCompaction(false) + , mani(nullptr) + , numL0Partitions(1) + , numL1Compactions(0) + , myLog(nullptr) + {} + +TableMgr::~TableMgr() { + assert(sMap.size() == 0); + + for (size_t ii=0; iiissueTableNumber(t_num) ); + + TableFile* t_file = new TableFile(this); + t_file->setLogger(myLog); + std::string t_filename = + TableFile::getTableFileName(opt.path, opt.prefixNum, t_num); + + EP( t_file->create(level, t_num, t_filename, opt.fOps, t_opt) ); + uint64_t bf_size = t_file->getBfSize(); + _log_info(myLog, "level %zu: created new table %zu_%zu, " + "BF size %zu bytes (%zu bits)", + level, opt.prefixNum, t_num, + bf_size / 8, bf_size); + + table_file_out = t_file; + return Status(); +} + +void TableMgr::logTableSettings(const DBConfig* db_config) { + if (db_config->compactionFactor) { + _log_info( myLog, "compaction factor %u, reuse factor %zu, " + "min file size %zu, cycle at least %u at most %u", + db_config->compactionFactor, + db_config->blockReuseFactor, + db_config->minFileSizeToCompact, + db_config->minBlockReuseCycleToCompact, + db_config->maxBlockReuseCycle ); + } else { + _log_info(myLog, "auto compaction is disabled"); + } + if (db_config->compactionCbFunc) { + _log_info(myLog, "compaction callback function is given by user"); + } + _log_info( myLog, "table lookup booster limit %zu %zu", + getBoosterLimit(0), getBoosterLimit(1) ); + _log_info( myLog, "next level extension %s", + getOnOffStr(db_config->nextLevelExtension) ); + _log_info( myLog, "bloom filter bits per unit: %.1f", + db_config->bloomFilterBitsPerUnit ); + if (db_config->nextLevelExtension) { + _log_info( 
myLog, "L0 table limit %zu, L1 table limit %zu, " + "L1 size limit %zu", + db_config->maxL0TableSize, + db_config->maxL1TableSize, + db_config->maxL1Size ); + std::string ratio_str; + for (double dd: db_config->tableSizeRatio) { + ratio_str += std::to_string(dd) + " "; + } + ratio_str += ", "; + for (double dd: db_config->levelSizeRatio) { + ratio_str += std::to_string(dd) + " "; + } + _log_info( myLog, "ratio: %s", ratio_str.c_str() ); + } + + DBMgr* mgr = DBMgr::getWithoutInit(); + if (mgr) { + GlobalConfig* g_conf = mgr->getGlobalConfig(); + _log_info( myLog, "flusher: %zu, compactor: %zu, writer: %zu, " + "max parallel writes: %zu", + g_conf->numFlusherThreads, + g_conf->numCompactorThreads, + g_conf->numTableWriters, + db_config->getMaxParallelWriters() ); + } +} + +Status TableMgr::init(const TableMgrOptions& _options) { + if (mani) return Status::ALREADY_INITIALIZED; + + opt = _options; + const DBConfig* db_config = getDbConfig(); + + compactStatus.resize(db_config->numL0Partitions); + for (size_t ii=0; ii*& entry = compactStatus[ii]; + entry = new std::atomic(false); + } + + Status s; + mani = new TableManifest(this, opt.fOps); + mani->setLogger(myLog); + + char p_num[16]; + sprintf(p_num, "%04" PRIu64, opt.prefixNum); + std::string m_filename = opt.path + "/table" + p_num + "_manifest"; + + try { + if (opt.fOps->exist(m_filename.c_str())) { + // Manifest file already exists, load it. + s = mani->load(opt.path, opt.prefixNum, m_filename); + if (!s) { + // Error happened, try again using backup file. + _log_err(myLog, "loading manifest error: %d, try again", s); + TC(BackupRestore::restore(opt.fOps, m_filename)); + s = mani->load(opt.path, opt.prefixNum, m_filename); + } + if (!s) throw s; + + // Check & adjust num level-0 partitions. 
+ size_t ii = 0; + size_t num_tables = 0; + s = mani->getNumTables(0, num_tables); + while (ii < num_tables) { + if (!mani->doesHashPartitionExist(ii)) break; + ++ii; + } + numL0Partitions = ii; + + // Adjust table file option for block reuse min size. + if ( !db_config->nextLevelExtension ) { + for (size_t ii=0; ii tables; + s = mani->getL0Tables(ii, tables); + if (!s) continue; + + // Find max working set size. + uint64_t max_working_set_size = 0; + for (TableInfo* t_info: tables) { + TableStats t_stats; + t_info->file->getStats(t_stats); + max_working_set_size = std::max( max_working_set_size, + t_stats.workingSetSizeByte ); + } + + for (TableInfo* t_info: tables) { + TableFileOptions t_opt; + t_opt.minBlockReuseFileSize = + std::max( t_opt.minBlockReuseFileSize, + max_working_set_size * + getDbConfig()->blockReuseFactor / 100 ); + t_info->file->changeOptions(t_opt); + t_info->done(); + } + } + } + + } else { + // Not exist, initial setup phase. + + // Create manifest file. + s = mani->create(opt.path, m_filename); + if (!s) return s; + + numL0Partitions = db_config->numL0Partitions; + if ( !db_config->logSectionOnly && + !db_config->readOnly ) { + // Hash partition. + for (size_t ii=0; iiaddTableFile(0, ii, SizedBuf(), t_file) ); + } + + } else { + // Otherwise: table section is disabled. + numL0Partitions = 0; + } + + // Store manifest file. + mani->store(); + mani->sync(); + } + logTableSettings(db_config); + + removeStaleFiles(); + + const DBConfig* db_config = getDbConfig(); + if (!db_config->readOnly) { + allowCompaction = true; + } + return Status(); + + } catch (Status s) { + _log_err(myLog, "init manifest error: %d", s); + DELETE(mani); + return s; + } +} + +Status TableMgr::removeStaleFiles() { + // Do nothing in read only mode. 
+ if (getDbConfig()->readOnly) return Status(); + + std::vector files; + FileMgr::scan(opt.path, files); + + char p_num[16]; + sprintf(p_num, "%04" PRIu64, opt.prefixNum); + std::string prefix = "table"; + prefix += p_num; + prefix += "_"; + size_t prefix_len = prefix.size(); + + std::string m_filename = "table"; + m_filename += p_num; + m_filename += "_manifest"; + + std::set table_numbers; + mani->getTableNumbers(table_numbers); + + for (auto& entry: files) { + std::string& ff = entry; + size_t pos = ff.find(prefix); + if ( pos != std::string::npos && + ff.find(m_filename) == std::string::npos ) { + // Check if it is in manifest. + uint64_t t_num = atoi( ff.substr( prefix_len, + ff.size() - prefix_len ).c_str() ); + if (table_numbers.find(t_num) == table_numbers.end()) { + Timer tt; + opt.fOps->remove(opt.path + "/" + ff); + _log_warn(myLog, "%s does not exist in manifest, removed. %zu us", + ff.c_str(), tt.getUs()); + } + } + } + return Status(); +} + +Status TableMgr::shutdown() { + fdb_status fs = fdb_shutdown(); + if (fs != FDB_RESULT_SUCCESS) { + return Status::ERROR; + } + return Status(); +} + +Status TableMgr::openSnapshot(DB* snap_handle, + const uint64_t checkpoint, + std::list*& table_list_out) +{ + Status s; + TableList* t_list = new TableList(); + { mGuard l(sMapLock); + sMap.insert( std::make_pair(snap_handle, t_list) ); + } + + SizedBuf empty_key; + size_t num_levels = mani->getNumLevels(); + + for (size_t ii=0; iigetTablesRange(ii, empty_key, empty_key, *t_list); + if (!s) continue; + } + + // If `checkpoint == 0`, take the latest one. 
+ for (auto& entry: *t_list) { + TableInfo* info = entry; + info->file->openSnapshot(snap_handle, checkpoint); + } + + table_list_out = t_list; + return Status(); +} + +Status TableMgr::closeSnapshot(DB* snap_handle) { + Status s; + TableList* t_list = nullptr; + { mGuard l(sMapLock); + auto entry = sMap.find(snap_handle); + assert(entry != sMap.end()); + t_list = entry->second; + sMap.erase(entry); + } + + for (auto& entry: *t_list) { + TableInfo* info = entry; + info->file->closeSnapshot(snap_handle); + info->done(); + } + delete t_list; + + return Status(); +} + +Status TableMgr::storeManifest() { + Status s; + EP(mani->store()); + EP(mani->sync()); + return Status(); +} + +Status TableMgr::get(DB* snap_handle, + Record& rec_inout, + bool meta_only) +{ + // NOTE: + // `rec_inout.kv.key` is given by user, shouldn't free in here. + const DBConfig* db_config = getDbConfig(); + if (db_config->logSectionOnly) return Status::KEY_NOT_FOUND; + + Status s; + Record latest_rec; + size_t num_levels = mani->getNumLevels(); + + for (size_t ii=0; ii tables; + s = mani->getTablesPoint(ii, rec_inout.kv.key, tables); + if (!s) continue; + + TableInfo* sm_table = nullptr; + // Search smallest normal table first and then the others, + // as it always has the newest data for the same key. + sm_table = getSmallestNormalTable(tables); + if (sm_table) { + Record new_rec; + new_rec.kv.key = rec_inout.kv.key; + s = sm_table->file->get(snap_handle, new_rec, meta_only); + if (s) { + // `latest_rec` should be empty. + assert(latest_rec.empty()); + new_rec.moveTo(latest_rec); + for (TableInfo* tt: tables) tt->done(); + break; + } + } + // Smallest table doesn't exist or exists but doesn't have the key. + + for (TableInfo* table: tables) { + if (sm_table && table == sm_table) { + // We already searched smallest normal table, skip. + sm_table->done(); + continue; + } + + Record new_rec; + // WARNING: SHOULD NOT free `key` here + // as it is given by caller. 
+ new_rec.kv.key = rec_inout.kv.key; + s = table->file->get(snap_handle, new_rec, meta_only); + if ( s.ok() && + ( latest_rec.empty() || + latest_rec.seqNum < new_rec.seqNum ) ) { + // `new_rec` is newer. + latest_rec.kv.key.clear(); + latest_rec.free(); + new_rec.moveTo(latest_rec); + } else { + // `latest_rec` is newer. + new_rec.kv.key.clear(); + new_rec.free(); + } + table->done(); + } + if (!latest_rec.empty()) break; + } + if (latest_rec.empty()) return Status::KEY_NOT_FOUND; + + // Since `key` is given by caller, change the other fields only. + latest_rec.kv.value.moveTo(rec_inout.kv.value); + latest_rec.meta.moveTo(rec_inout.meta); + rec_inout.seqNum = latest_rec.seqNum; + rec_inout.type = latest_rec.type; + return Status(); +} + +TableInfo* TableMgr::getSmallestSrcTable(const std::list& tables, + uint32_t target_hash_num) +{ + TableInfo* target_table = nullptr; + uint64_t min_tnum = std::numeric_limits::max(); + for (TableInfo* table: tables) { + if ( table->isSrc() && + table->number < min_tnum && + ( target_hash_num == _SCU32(-1) || + target_hash_num == table->hashNum ) ) { + min_tnum = table->number; + target_table = table; + } + } + return target_table; +} + +void TableMgr::getTwoSmallSrcTables(const std::list& tables, + uint32_t target_hash_num, + TableInfo*& table1_out, + TableInfo*& table2_out) +{ + std::map sorted_tables; + for (TableInfo* table: tables) { + if ( table->isSrc() && + ( target_hash_num == _SCU32(-1) || + target_hash_num == table->hashNum ) ) { + sorted_tables.emplace( table->number, table ); + } + } + + size_t cnt = 0; + table1_out = table2_out = nullptr; + for (auto& entry: sorted_tables) { + if (cnt == 0) table1_out = entry.second; + else if (cnt == 1) table2_out = entry.second; + cnt++; + } +} + +TableInfo* TableMgr::getSmallestNormalTable(const std::list& tables, + uint32_t target_hash_num) +{ + TableInfo* target_table = nullptr; + uint64_t min_tnum = std::numeric_limits::max(); + for (TableInfo* table: tables) { + if ( 
table->isNormal() && + table->number < min_tnum && + ( target_hash_num == _SCU32(-1) || + target_hash_num == table->hashNum ) ) { + min_tnum = table->number; + target_table = table; + } + } + return target_table; +} + +void TableMgr::getTwoSmallNormalTables(const std::list& tables, + uint32_t target_hash_num, + TableInfo*& table1_out, + TableInfo*& table2_out) +{ + std::map sorted_tables; + for (TableInfo* table: tables) { + if ( table->isNormal() && + ( target_hash_num == _SCU32(-1) || + target_hash_num == table->hashNum ) ) { + sorted_tables.emplace( table->number, table ); + } + } + + size_t cnt = 0; + table1_out = table2_out = nullptr; + for (auto& entry: sorted_tables) { + if (cnt == 0) table1_out = entry.second; + else if (cnt == 1) table2_out = entry.second; + cnt++; + } +} + +size_t TableMgr::getNumLevels() const { + return mani->getNumLevels(); +} + +Status TableMgr::getLevelSize(size_t level, + uint64_t& wss_out, + uint64_t& total_out, + uint64_t& max_stack_size_out) +{ + if (level >= mani->getNumLevels()) return Status::INVALID_LEVEL; + + Status s; + std::list tables; + SizedBuf empty_key; + mani->getTablesRange(level, empty_key, empty_key, tables); + + uint64_t wss_local = 0; + uint64_t total_local = 0; + uint64_t max_stack_local = 0; + for (TableInfo*& t_info: tables) { + TableStats t_stats; + EP( t_info->file->getStats(t_stats) ); + wss_local += t_stats.workingSetSizeByte; + total_local += t_stats.totalSizeByte; + + uint64_t cur_stack_size = 1; + TableStack* stack = t_info->stack; + if (stack) { + std::lock_guard l(stack->lock); + cur_stack_size += stack->tables.size(); + } + max_stack_local = std::max( cur_stack_size, max_stack_local ); + + t_info->done(); + } + + wss_out = wss_local; + total_out = total_local; + max_stack_size_out = max_stack_local; + + return Status::OK; +} + +uint64_t TableMgr::getLevelSizeLimit(size_t level) const { + if (level >= mani->getNumLevels()) return 0; + const DBConfig* db_config = getDbConfig(); + + if (level == 0) { + 
return (uint64_t) + db_config->numL0Partitions * + db_config->maxL0TableSize; + } + uint64_t ret = db_config->maxL1Size; + size_t num_ratio_elems = db_config->levelSizeRatio.size(); + double last_ratio = num_ratio_elems + ? *db_config->levelSizeRatio.rbegin() + : 10; + for (size_t ii = 1; ii < level; ++ii) { + size_t vector_idx = ii - 1; + if (num_ratio_elems > vector_idx) { + ret *= db_config->levelSizeRatio[vector_idx]; + } else { + ret *= last_ratio; + } + } + return ret; +} + +bool TableMgr::disallowCompaction() { + bool old_value = allowCompaction; + allowCompaction = false; + return old_value; +} + +Status TableMgr::close() { + disallowCompaction(); + + _log_info(myLog, "Wait for on-going compaction."); + bool wait_more = false; + uint64_t ticks = 0; + do { + wait_more = false; + // non-LSM mode compaction. + for (size_t ii=0; iiload()) { + wait_more = true; + break; + } + } + // LSM mode compaction. + { std::lock_guard l(lockedTablesLock); + if (lockedTables.size()) wait_more = true; + } + if (wait_more) { + ticks++; + Timer::sleepMs(100); + } + } while (wait_more); + _log_info(myLog, "Closing TableMgr, %zu ticks", ticks); + + return Status(); +} + +Status TableMgr::getAvailCheckpoints(std::list& chk_out) { + if (!mani) return Status::NOT_INITIALIZED; + + // Get tables in level-0. + std::list t_info_ret; + SizedBuf empty_key; + + // NOTE: checkpoint is a special case, we only search for level-0. + mani->getTablesRange(0, empty_key, empty_key, t_info_ret); + + // For each table, get checkpoints. + std::vector< std::unordered_set > chks_by_hash(numL0Partitions); + for (auto& entry: t_info_ret) { + TableInfo* info = entry; + std::list chks_by_file; + info->file->getAvailCheckpoints(chks_by_file); + + // Insert into hash set. + for (auto& le: chks_by_file) { + // 0: special case, skip. + if (le == 0) continue; + chks_by_hash[info->hashNum].insert(le); + } + + info->done(); + } + + // Checkpoint should exist in all hash partitions. 
+ for (auto& e_outer: chks_by_hash[0]) { + bool found_outer = true; + for (size_t ii=1; ii tables; + size_t num_levels = mani->getNumLevels(); + for (size_t ii=0; iigetTablesRange(ii, empty_key, empty_key, tables); + if (!s) continue; + } + + for (TableInfo*& cur_table: tables) { + TableStats local_stat; + cur_table->file->getStats(local_stat); + aggr_stats_out.numKvs += local_stat.numKvs; + aggr_stats_out.workingSetSizeByte += local_stat.workingSetSizeByte; + cur_table->done(); + } + + aggr_stats_out.cacheUsedByte = fdb_get_buffer_cache_used(); + return Status(); +} + +Status TableMgr::getLastSeqnum(uint64_t& seqnum_out) { + Status s; + std::list tables; + try { + for (size_t ii=0; iigetL0Tables(ii, tables) ); + } + + uint64_t seqnum = 0; + for (TableInfo*& cur_table: tables) { + TableStats stats; + cur_table->file->getStats(stats); + _log_info(myLog, "table %zu (hash %zu) last seq %zu", + cur_table->number, cur_table->hashNum, stats.lastSeqnum); + if (valid_number(stats.lastSeqnum) && stats.lastSeqnum > seqnum) { + seqnum = stats.lastSeqnum; + } + cur_table->done(); + } + if (seqnum) seqnum_out = seqnum; + return Status(); + + } catch (Status s) { + for (TableInfo*& cur_table: tables) cur_table->done(); + return s; + } +} + +uint64_t TableMgr::getBoosterLimit(size_t level) const { + const DBConfig* db_config = getDbConfig(); + + // Invalid mode. + if (!db_config->nextLevelExtension) return 0; + + // Only for level 0 and 1. 
+ if ( level >= 2 || + level + 1 > db_config->lookupBoosterLimit_mb.size() ) { + return 0; + } + + if (level == 0) { + uint64_t size_limit = (uint64_t)db_config->lookupBoosterLimit_mb[0] + * 1024 * 1024; + uint64_t ret = size_limit + / sizeof(TableLookupBooster::Elem) + / db_config->numL0Partitions; + return ret; + } + if (level == 1) { + uint64_t size_limit = (uint64_t)db_config->lookupBoosterLimit_mb[1] + * 1024 * 1024; + size_t l1_tables = 1; + mani->getNumTables(level, l1_tables); + l1_tables = std::max(l1_tables, (size_t)db_config->numL0Partitions); + uint64_t ret = size_limit + / sizeof(TableLookupBooster::Elem) + / l1_tables; + return ret; + } + return 0; +} + +} // namespace jungle + diff --git a/src/table_mgr.h b/src/table_mgr.h new file mode 100644 index 0000000..50bef4e --- /dev/null +++ b/src/table_mgr.h @@ -0,0 +1,477 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "avltree.h" +#include "fileops_base.h" +#include "internal_helper.h" +#include "table_file.h" +#include "table_helper.h" +#include "table_manifest.h" +#include "table_writer.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace jungle { + +class TableMgrOptions { +public: + TableMgrOptions() + : fOps(nullptr) + , prefixNum(0) + , dbConfig(nullptr) + {} + + std::string path; + + FileOps* fOps; + + // KVS ID. + uint64_t prefixNum; + + // Pointer to the parent DB handle's config. + const DBConfig* dbConfig; +}; + +class TableStats { +public: + TableStats() + : numKvs(0) + , workingSetSizeByte(0) + , totalSizeByte(0) + , cacheUsedByte(0) + , blockReuseCycle(0) + , lastSeqnum(0) + , approxDocCount(0) + , approxDelCount(0) + {} + uint64_t numKvs; + uint64_t workingSetSizeByte; + uint64_t totalSizeByte; + uint64_t cacheUsedByte; + uint64_t blockReuseCycle; + uint64_t lastSeqnum; + uint64_t approxDocCount; + uint64_t approxDelCount; +}; + +namespace checker { class Checker; } + +class TableMgr { + friend class checker::Checker; + friend class Compactor; + friend class TableLockHolder; + friend class LevelLockHolder; + +private: + struct LsmFlushResult { + LsmFlushResult() : tFile(nullptr) {} + LsmFlushResult(TableFile* _file) : tFile(_file) {} + LsmFlushResult(TableFile* _file, const SizedBuf& _key) + : tFile(_file), minKey(_key) {} + + // Less function, descending `minKey` order: + // `true` if `ll.minKey > rr.minKey`. 
+ static bool cmp(const LsmFlushResult& ll, const LsmFlushResult& rr) { + return (ll.minKey > rr.minKey); + } + + TableFile* tFile; + SizedBuf minKey; + }; + + enum MergeType { + LEVELING = 0x0, + TIERING = 0x1, + APPEND = 0x2, + }; + +public: + TableMgr(DB* parent_db); + + ~TableMgr(); + + enum VictimPolicy { + WORKING_SET_SIZE = 0x0, + STALE_RATIO = 0x1, + STACK_SIZE = 0x2, + WORKING_SET_SIZE_SPLIT = 0x3, + SMALL_WORKING_SET = 0x4, + }; + + enum MergeStrategy { + NONE = 0x0, + INTERLEVEL = 0x1, + SPLIT = 0x2, + INPLACE = 0x3, + MERGE = 0x4, + }; + + Status init(const TableMgrOptions& _options); + + Status removeStaleFiles(); + + Status shutdown(); + + Status openSnapshot(DB* snap_handle, + const uint64_t checkpoint, + std::list*& table_list_out); + Status closeSnapshot(DB* snap_handle); + + Status setBatch(std::list& batch, + std::list& checkpoints, + bool bulk_load_mode = false); + + Status setBatchHash(std::list& batch, + std::list& checkpoints, + bool bulk_load_mode); + + Status splitTableItr(TableInfo* victim_table); + + Status storeManifest(); + + Status get(DB* snap_handle, + Record& rec_inout, + bool meta_only = false); + + size_t getNumLevels() const; + + Status getLevelSize(size_t level, + uint64_t& wss_out, + uint64_t& total_out, + uint64_t& max_stack_size_out); + + uint64_t getLevelSizeLimit(size_t level) const; + + Status pickVictimTable(size_t level, + VictimPolicy policy, + bool check_limit, + TableInfo*& victim_table_out, + uint64_t& wss_out, + uint64_t& total_out); + + TableInfo* findLocalVictim(size_t level, + TableInfo* given_victim, + VictimPolicy policy, + bool honor_limit); + + Status splitLevel(const CompactOptions& options, + TableInfo* victim_table, + size_t level); + + Status mergeLevel(const CompactOptions& options, + TableInfo* victim_table, + size_t level); + + Status compactLevelItr(const CompactOptions& options, + TableInfo* victim_table, + size_t level); + + Status compactInPlace(const CompactOptions& options, + TableInfo* 
victim_table, + size_t level); + + Status compactL0(const CompactOptions& options, + uint32_t hash_num); + + bool isL0CompactionInProg(); + + bool chkL0CompactCond(uint32_t hash_num); + + Status chkLPCompactCond(size_t level, + TableMgr::MergeStrategy& s_out, + TableInfo*& victim_table_out); + + bool disallowCompaction(); + + Status close(); + + Status getAvailCheckpoints(std::list& chk_out); + + const DBConfig* getDbConfig() const { return opt.dbConfig; } + + const TableMgrOptions* getTableMgrOptions() const { return &opt; } + + uint32_t getNumL0Partitions() const { return numL0Partitions; } + + void setLogger(SimpleLogger* logger) { + myLog = logger; + if (mani) mani->setLogger(myLog); + } + + Status getStats(TableStats& aggr_stats_out); + + Status getLastSeqnum(uint64_t& seqnum_out); + + uint64_t getBoosterLimit(size_t level) const; + + bool isCompactionAllowed() const { return allowCompaction; } + + void setTableFile(std::list& batch, + std::list& checkpoints, + bool bulk_load_mode, + TableFile* table_file, + uint32_t target_hash, + const SizedBuf& min_key, + const SizedBuf& next_min_key); + + void setTableFileOffset(std::list& checkpoints, + TableFile* src_file, + TableFile* dst_file, + std::vector& offsets, + uint64_t start_index, + uint64_t count); + + void setTableFileItrFlush(TableFile* dst_file, + std::list& recs_batch, + bool without_commit); + + struct Iterator { + public: + Iterator(); + ~Iterator(); + + enum SeekOption { + GREATER = 0, + SMALLER = 1, + }; + + Status init(DB* snap_handle, + TableMgr* table_mgr, + const SizedBuf& start_key, + const SizedBuf& end_key); + Status initSN(DB* snap_handle, + TableMgr* table_mgr, + uint64_t min_seq, + uint64_t max_seq); + Status get(Record& rec_out); + Status prev(); + Status next(); + Status seek(const SizedBuf& key, SeekOption opt = GREATER); + Status seekSN(const uint64_t seqnum, SeekOption opt = GREATER); + Status gotoBegin(); + Status gotoEnd(); + Status close(); + private: + enum Type { + BY_KEY = 0, + 
BY_SEQ = 1 + }; + struct ItrItem { + ItrItem() : flags(0x0), tInfo(nullptr), tItr(nullptr) {} + enum Flag { + none = 0x0, + no_more_prev = 0x1, + no_more_next = 0x2, + }; + inline static int cmpSeq(avl_node *a, avl_node *b, void *aux) { + ItrItem* aa = _get_entry(a, ItrItem, an); + ItrItem* bb = _get_entry(b, ItrItem, an); + if (aa->lastRec.seqNum < bb->lastRec.seqNum) return -1; + else if (aa->lastRec.seqNum > bb->lastRec.seqNum) return 1; + + // In case of the same seq number, + // we should compare pointer to make AVL-tree distinguish + // different objects who have the same seq number. + // NOTE: iterator will not care of equality (==) condition. + if (aa < bb) return -1; + else if (aa > bb) return 1; + + // Return 0 only if a and b are the same memory object. + return 0; + } + inline static int cmpKey(avl_node *a, avl_node *b, void *aux) { + ItrItem* aa = _get_entry(a, ItrItem, an); + ItrItem* bb = _get_entry(b, ItrItem, an); + + CMP_NULL_CHK(aa->lastRec.kv.key.data, bb->lastRec.kv.key.data); + + int cmp = 0; + if (aux) { + // Custom cmp mode. + TableMgr* tm = reinterpret_cast(aux); + CustomCmpFunc func = tm->getDbConfig()->cmpFunc; + void* param = tm->getDbConfig()->cmpFuncParam; + cmp = func(aa->lastRec.kv.key.data, aa->lastRec.kv.key.size, + bb->lastRec.kv.key.data, bb->lastRec.kv.key.size, + param); + } else { + cmp = SizedBuf::cmp(aa->lastRec.kv.key, bb->lastRec.kv.key); + } + + // Note: key: ascending, seq: descending order. 
+ if (cmp == 0) return cmpSeq(b, a, aux); + return cmp; + } + avl_node an; + uint8_t flags; + TableInfo* tInfo; + TableFile::Iterator* tItr; + Record lastRec; + }; + + void addTableItr(DB* snap_handle, TableInfo* t_info); + Status initInternal(DB* snap_handle, + TableMgr* table_mgr, + uint64_t min_seq, + uint64_t max_seq, + const SizedBuf& start_key, + const SizedBuf& end_key, + Type _type); + Status seekInternal(const SizedBuf& key, + const uint64_t seqnum, + SeekOption opt, + bool goto_end = false); + inline int cmpSizedBuf(const SizedBuf& l, const SizedBuf& r); + bool checkValidBySeq(ItrItem* item, + const uint64_t cur_seq, + const bool is_prev = false); + bool checkValidByKey(ItrItem* item, + const SizedBuf& cur_key, + const bool is_prev = false); + + Type type; + TableMgr* tMgr; + std::vector< std::vector > tables; + std::list* snapTableList; + uint64_t minSeqSnap; + uint64_t maxSeqSnap; + SizedBuf startKey; + SizedBuf endKey; + avl_tree curWindow; + avl_node* windowCursor; + }; + +protected: +// === TYPES + struct RecGroup { + RecGroup(std::list* _recs, + TableInfo* _table) + : recs(_recs), table(_table) {} + // List of records to put into table. + std::list* recs; + // If null, no corresponding table. Should create a new table. + TableInfo* table; + }; + + struct RecGroupItr { + RecGroupItr(const SizedBuf& _min_key, + uint64_t _index, + TableInfo* _table) + : index(_index) + , table(_table) + { + _min_key.copyTo(minKey); + } + + ~RecGroupItr() { + minKey.free(); + } + + // Min key for this table. + SizedBuf minKey; + + // Starting index of this table, in offset array. + uint64_t index; + + // If null, no corresponding table. Should create a new table. 
+ TableInfo* table; + }; + + using TableList = std::list; + using SnapMap = std::unordered_map; + +// === FUNCTIONS + void logTableSettings(const DBConfig* db_config); + + Status createNewTableFile(size_t level, + TableFile*& table_file_out, + const TableFileOptions& t_opt); + + TableInfo* getSmallestSrcTable(const std::list& tables, + uint32_t target_hash_num = _SCU32(-1)); + + void getTwoSmallSrcTables(const std::list& tables, + uint32_t target_hash_num, + TableInfo*& table1_out, + TableInfo*& table2_out); + + TableInfo* getSmallestNormalTable(const std::list& tables, + uint32_t target_hash_num = _SCU32(-1)); + + void getTwoSmallNormalTables(const std::list& tables, + uint32_t target_hash_num, + TableInfo*& table1_out, + TableInfo*& table2_out); + + void putLsmFlushResult(TableFile* cur_file, + const std::list& local_records, + std::list& res_out); + + void putLsmFlushResultWithKey(TableFile* cur_file, + const SizedBuf& key, + std::list& res_out); + + bool isTableLocked(uint64_t t_number); + + bool isLevelLocked(uint64_t l_number); + +// === VARIABLES + const size_t APPROX_META_SIZE; + + // Backward pointer to parent DB instance. + DB* parentDb; + + std::atomic allowCompaction; + + TableMgrOptions opt; + TableManifest* mani; + + std::mutex sMapLock; + SnapMap sMap; + + std::mutex L0Lock; + + uint32_t numL0Partitions; + + // Compaction status of L0 hash partitions. + std::vector< std::atomic* > compactStatus; + + // Number of on-going compactions in L1, + // only used for level extension mode. + std::atomic numL1Compactions; + + // Set of tables being compacted/merged/split. + std::set lockedTables; + std::mutex lockedTablesLock; + + // Set of (source) levels that interlevel compaction is in progress. 
+ std::unordered_set lockedLevels; + std::mutex lockedLevelsLock; + + SimpleLogger* myLog; +}; + +} // namespace jungle + diff --git a/src/table_set_batch.cc b/src/table_set_batch.cc new file mode 100644 index 0000000..8337e2b --- /dev/null +++ b/src/table_set_batch.cc @@ -0,0 +1,247 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "table_mgr.h" + +#include "db_internal.h" +#include "internal_helper.h" + +#include _MACRO_TO_STR(LOGGER_H) + +namespace jungle { + +void TableMgr::setTableFile( std::list& batch, + std::list& checkpoints, + bool bulk_load_mode, + TableFile* table_file, + uint32_t target_hash, + const SizedBuf& min_key, + const SizedBuf& next_min_key ) +{ + table_file->setBatch( batch, + checkpoints, + min_key, + next_min_key, + target_hash, + bulk_load_mode ); + + if (myLog->debugAllowed()) { + _log_debug( myLog, + "Set batch table num %zu, hash %zu, " + "key1: %s key2: %s", + table_file->getNumber(), target_hash, + min_key.toReadableString().c_str(), + next_min_key.toReadableString().c_str() ); + } +} + +void TableMgr::setTableFileOffset( std::list& checkpoints, + TableFile* src_file, + TableFile* dst_file, + std::vector& offsets, + uint64_t start_index, + uint64_t count ) +{ + const DBConfig* db_config = getDbConfig(); + (void)db_config; + DBMgr* mgr = DBMgr::getWithoutInit(); + 
DebugParams d_params = mgr->getDebugParams(); + + Status s; + TableFile::Iterator itr; + SizedBuf empty_key; + std::list recs_batch; + + try { + for (uint64_t ii = start_index; ii < start_index + count; ++ii) { + if (!isCompactionAllowed()) { + throw Status(Status::COMPACTION_CANCELLED); + } + + Record rec_out; + Record::Holder h_rec_out(rec_out); + s = src_file->getByOffset(nullptr, offsets[ii], rec_out); + if (!s) { + _log_fatal(myLog, "failed to read record at %zu", offsets[ii]); + assert(0); + continue; + } + uint32_t key_hash_val = getMurmurHash32(rec_out.kv.key);; + uint64_t offset_out = 0; // not used. + dst_file->setSingle(key_hash_val, rec_out, offset_out); + + if (d_params.compactionItrScanDelayUs) { + // If debug parameter is given, sleep here. + Timer::sleepUs(d_params.compactionItrScanDelayUs); + } + } + + // Final commit, and generate snapshot on it. + setTableFileItrFlush(dst_file, recs_batch, false); + _log_info(myLog, "(end of batch) set total %zu records", count); + + } catch (Status s) { // ----------------------------------- + _log_err(myLog, "got error: %d", (int)s); + itr.close(); + + for (Record* rr: recs_batch) { + rr->free(); + delete rr; + } + } +} + +void TableMgr::setTableFileItrFlush(TableFile* dst_file, + std::list& recs_batch, + bool without_commit) +{ + SizedBuf empty_key; + std::list dummy_chk; + + dst_file->setBatch( recs_batch, dummy_chk, + empty_key, empty_key, _SCU32(-1), + without_commit ); + for (Record* rr: recs_batch) { + rr->free(); + delete rr; + } + recs_batch.clear(); +} + +Status TableMgr::setBatch(std::list& batch, + std::list& checkpoints, + bool bulk_load_mode) +{ + // NOTE: + // This function deals with level-0 tables only, + // which means that it is always hash-partitioning. + const DBConfig* db_config = getDbConfig(); + if (db_config->logSectionOnly) return Status::TABLES_ARE_DISABLED; + + std::unique_lock l(L0Lock); + Timer tt; + Status s; + + // Not pure LSM: L0 is hash partitioned. 
+ EP( setBatchHash(batch, checkpoints, bulk_load_mode) ); + + uint64_t elapsed_us = tt.getUs(); + _log_info(myLog, "L0 write done: %zu records, %zu us, %.1f iops", + batch.size(), elapsed_us, + (double)batch.size() * 1000000 / elapsed_us); + + return Status(); +} + +Status TableMgr::setBatchHash( std::list& batch, + std::list& checkpoints, + bool bulk_load_mode ) +{ + Status s; + std::list target_tables; + + DBMgr* db_mgr = DBMgr::getWithoutInit(); + DebugParams d_params = db_mgr->getDebugParams(); + + _log_info(myLog, "Records: %zu", batch.size()); + + // NOTE: write in parallel. + size_t num_partitions = getNumL0Partitions(); + size_t max_writers = getDbConfig()->getMaxParallelWriters(); + + // For the case where `num_partitions > num_writers`. + for (size_t ii = 0; ii < num_partitions; ) { + size_t upto_orig = std::min(ii + max_writers, num_partitions); + + // NOTE: request `req_writers - 1`, as the other one is this thread. + size_t req_writers = upto_orig - ii; + TableWriterHolder twh(db_mgr->tableWriterMgr(), req_writers - 1); + + // Lease may not succeed, adjust `upto`. + size_t leased_writers = twh.leasedWriters.size(); + size_t upto = ii + leased_writers + 1; + + for (size_t jj = ii; jj < upto; ++jj) { + size_t worker_idx = jj - ii; + bool leased_thread = (jj + 1 < upto); + + TableWriterArgs local_args; + local_args.myLog = myLog; + + TableWriterArgs* w_args = (leased_thread) + ? &twh.leasedWriters[worker_idx]->writerArgs + : &local_args; + w_args->callerAwaiter.reset(); + + // Find target table for the given hash number. + std::list tables; + s = mani->getL0Tables(jj, tables); + if (!s) continue; + + TableInfo* target_table = getSmallestNormalTable(tables, jj); + if (!target_table) { + // Target table does not exist, skip. + for (TableInfo*& entry: tables) entry->done(); + continue; + } + + w_args->payload = TableWritePayload( this, + &batch, + &checkpoints, + target_table->file, + jj, + bulk_load_mode ); + // Release all tables except for the target. 
+ for (TableInfo*& entry: tables) { + if (entry != target_table) entry->done(); + } + target_tables.push_back(target_table); + + if (leased_thread) { + // Leased threads. + w_args->invoke(); + } else { + // This thread. + TableWriterMgr::doTableWrite(w_args); + } + } + + // Wait for each worker. + for (size_t jj = ii; jj < upto - 1; ++jj) { + size_t worker_idx = jj - ii; + TableWriterArgs* w_args = &twh.leasedWriters[worker_idx]->writerArgs; + while ( !w_args->payload.isEmpty() ) { + w_args->callerAwaiter.wait_ms(1000); + w_args->callerAwaiter.reset(); + } + } + + if (d_params.tableSetBatchCb) { + DebugParams::GenericCbParams p; + d_params.tableSetBatchCb(p); + } + + ii += leased_writers + 1; + } + + // Release all target tables. + for (TableInfo*& entry: target_tables) entry->done(); + + return Status(); +} + +} // namespace jungle + diff --git a/src/table_split.cc b/src/table_split.cc new file mode 100644 index 0000000..d0ee61d --- /dev/null +++ b/src/table_split.cc @@ -0,0 +1,439 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "table_mgr.h" + +#include "db_internal.h" +#include "internal_helper.h" + +#include _MACRO_TO_STR(LOGGER_H) + +namespace jungle { + +Status TableMgr::splitLevel(const CompactOptions& options, + TableInfo* victim_table, + size_t level) +{ + if (level >= mani->getNumLevels()) return Status::INVALID_LEVEL; + if ( victim_table && + victim_table->level != level ) return Status::INVALID_PARAMETERS; + + Status s; + const DBConfig* db_config = getDbConfig(); + + bool honor_limit = false; + VictimPolicy vp = WORKING_SET_SIZE_SPLIT; + TableInfo* local_victim = findLocalVictim( level, victim_table, + vp, honor_limit ); + if (!local_victim) return Status::TABLE_NOT_FOUND; + + if (db_config->nextLevelExtension) { + s = splitTableItr(local_victim); + } + + // WARNING: + // Release it ONLY WHEN this table is not given by caller. + // If not, caller is responsible to release the table. + if (!victim_table) local_victim->done(); + + return s; +} + +// Logically do the same thing as `splitTable()`, +// but does not keep records in memory, and instead +// use iterator directly. +// +// Trade-off: less memory, but needs two-phase scan. +// +Status TableMgr::splitTableItr(TableInfo* victim_table) { + size_t level = victim_table->level; + if (level >= mani->getNumLevels()) return Status::INVALID_LEVEL; + + DBMgr* mgr = DBMgr::getWithoutInit(); + DebugParams d_params = mgr->getDebugParams(); + + const DBConfig* db_config = getDbConfig(); + Status s; + SizedBuf empty_key; + Timer tt; + + // Lock victim table only, all the others will be newly created. 
+ TableLockHolder tl_holder(this, {victim_table->number}); + if (!tl_holder.ownsLock()) return Status::OPERATION_IN_PROGRESS; + + _log_info( myLog, "split table %zu_%zu, level %zu, min key %s begins", + opt.prefixNum, victim_table->number, level, + victim_table->minKey.toReadableString().c_str() ); + + std::list results; + // Clone of min keys of newly created tables. + std::vector min_keys; + + TableStats victim_stats; + victim_table->file->getStats(victim_stats); + + uint64_t num_records_read = 0; + uint64_t TABLE_LIMIT = db_config->getMaxTableSize(level); + uint64_t NUM_OUTPUT_TABLES = + (victim_stats.workingSetSizeByte / TABLE_LIMIT) + 1; + uint64_t EXP_DOCS = (victim_stats.numKvs / NUM_OUTPUT_TABLES) + 1; + bool moved_to_new_table = true; + _log_info(myLog, "split table WSS %zu limit %zu num docs %zu " + "output tables %zu expected docs per table %zu", + victim_stats.workingSetSizeByte, + TABLE_LIMIT, + victim_stats.numKvs, + NUM_OUTPUT_TABLES, + EXP_DOCS); + + TableFile::Iterator* itr = new TableFile::Iterator(); + EP( itr->init(nullptr, victim_table->file, empty_key, empty_key) ); + + std::vector offsets; + // Reserve 10% more headroom, just in case. + offsets.reserve(victim_stats.approxDocCount * 11 / 10); + std::vector start_indexes; + + try { + if (level == 1) numL1Compactions.fetch_add(1); + + uint64_t cur_docs_acc = 0; + + // Initial scan to get + // 1) number of files after split, and + // 2) min keys for each new file. + do { + if (!isCompactionAllowed()) { + throw Status(Status::COMPACTION_CANCELLED); + } + + Record rec_out; + Record::Holder h_rec_out(rec_out); + size_t value_size_out = 0; + uint64_t offset_out = 0; + s = itr->getMeta(rec_out, value_size_out, offset_out); + if (!s) break; + + offsets.push_back(offset_out); + uint64_t cur_index = offsets.size() - 1; + + if (moved_to_new_table) { + moved_to_new_table = false; + + SizedBuf key_clone; + if (min_keys.size() == 0) { + // First split table: should inherit + // victim table's min key. 
+ victim_table->minKey.copyTo(key_clone); + } else { + rec_out.kv.key.copyTo(key_clone); + } + min_keys.push_back( key_clone ); + start_indexes.push_back( cur_index ); + } + + cur_docs_acc++; + num_records_read++; + + if (d_params.compactionDelayUs) { + // If debug parameter is given, sleep here. + Timer::sleepUs(d_params.compactionDelayUs); + } + + if (cur_docs_acc > EXP_DOCS) { + // Go to next table. + cur_docs_acc = 0; + moved_to_new_table = true; + } + } while (itr->next().ok()); + itr->close(); + DELETE(itr); + + uint64_t elapsed_us = std::max( tt.getUs(), (uint64_t)1 ); + double scan_rate = (double)num_records_read * 1000000 / elapsed_us; + size_t num_new_tables = min_keys.size(); + + _log_info(myLog, "split table %zu_%zu, level %zu, %zu records, %zu files, " + "initial scan %zu us, %.1f iops", + opt.prefixNum, victim_table->number, + level, num_records_read, num_new_tables, + elapsed_us, scan_rate); + + size_t max_writers = db_config->getMaxParallelWriters(); + std::list dummy_chk; + + for (size_t ii=0; iitableWriterMgr(), req_writers - 1); + + // Lease may not succeed, adjust `upto`. + size_t leased_writers = twh.leasedWriters.size(); + size_t upto = ii + leased_writers + 1; + + for (size_t jj = ii; jj < upto; jj++) { + size_t worker_idx = jj - ii; + bool leased_thread = (jj + 1 < upto); + + // Create a new file and then flush. + TableFile* cur_file = nullptr; + TableFileOptions t_opt; + s = createNewTableFile(level, cur_file, t_opt); + if (!s) continue; + + TableWriterArgs local_args; + local_args.myLog = myLog; + + TableWriterArgs* w_args = (leased_thread) + ? &twh.leasedWriters[worker_idx]->writerArgs + : &local_args; + w_args->callerAwaiter.reset(); + + uint64_t count = (jj + 1 == num_new_tables) + ? 
offsets.size() - start_indexes[jj] + : start_indexes[jj+1] - start_indexes[jj]; + w_args->payload = TableWritePayload( this, + &offsets, + start_indexes[jj], + count, + &dummy_chk, + victim_table->file, + cur_file ); + putLsmFlushResultWithKey(cur_file, min_keys[jj], results); + + if (leased_thread) { + // Leased threads. + w_args->invoke(); + } else { + // This thread. + TableWriterMgr::doTableWrite(w_args); + } + } + + // Wait for workers. + for (size_t jj = ii; jj < upto - 1; jj++) { + size_t worker_idx = jj - ii; + TableWriterArgs* w_args = &twh.leasedWriters[worker_idx]->writerArgs; + while ( !w_args->payload.isEmpty() ) { + w_args->callerAwaiter.wait_ms(1000); + w_args->callerAwaiter.reset(); + } + } + + if (!isCompactionAllowed()) throw Status(Status::COMPACTION_CANCELLED); + + ii += leased_writers + 1; + } + + { // Grab lock, add first, and then remove next. + std::lock_guard l(mani->getLock()); + // WARNING: + // We should add in descending order of min key. + // Otherwise, if there is a point query in the middle, + // it may go to wrong (newly created) table which + // causes false "key not found". 
+ results.sort( LsmFlushResult::cmp ); + for (LsmFlushResult& rr: results) { + TC( mani->addTableFile( level, 0, rr.minKey, rr.tFile ) ); + rr.tFile = nullptr; + } + mani->removeTableFile(level, victim_table); + } + mani->store(); + mani->sync(); + + elapsed_us = std::max( tt.getUs(), (uint64_t)1 ); + double split_rate = (double)num_records_read * 1000000 / elapsed_us; + if (parentDb) { + parentDb->p->tStats.lastSplitRate = split_rate; + parentDb->p->tStats.lastSplitRateExpiry.reset(); + } + _log_info( myLog, "split table done: %zu us, %zu records, %.1f iops", + elapsed_us, num_records_read, split_rate); + + for (SizedBuf& sb: min_keys) sb.free(); + + if (level == 1) numL1Compactions.fetch_sub(1); + return Status(); + + } catch (Status s) { // --------------------------------------------------- + _log_err(myLog, "split failed: %d", (int)s); + + if (itr) { + itr->close(); + DELETE(itr); + } + + for (SizedBuf& sb: min_keys) sb.free(); + + for (auto entry: results) { + LsmFlushResult& rr = entry; + delete rr.tFile; + rr.minKey.free(); + } + if (level == 1) numL1Compactions.fetch_sub(1); + return s; + } +} + +Status TableMgr::mergeLevel(const CompactOptions& options, + TableInfo* victim_table, + size_t level) +{ + if (level == 0 || level >= mani->getNumLevels()) return Status::INVALID_LEVEL; + if ( victim_table && + victim_table->level != level ) return Status::INVALID_PARAMETERS; + + Status s; + DBMgr* mgr = DBMgr::getWithoutInit(); + DebugParams d_params = mgr->getDebugParams(); + + bool honor_limit = false; + VictimPolicy vp = SMALL_WORKING_SET; + TableInfo* local_victim = findLocalVictim( level, victim_table, + vp, honor_limit ); + if (!local_victim) return Status::TABLE_NOT_FOUND; + + // Find the target table (i.e., right before the victim). 
+ std::list tables; + SizedBuf empty_key; + mani->getTablesRange(level, empty_key, empty_key, tables); + + TableInfo* target_table = nullptr; + for (TableInfo* tt: tables) { + if (tt == local_victim) break; + target_table = tt; + } + for (TableInfo* tt: tables) { + if (tt != target_table) tt->done(); + } + + TableFile::Iterator* itr = nullptr; + try { + assert(target_table); + if (!target_table) { + // It should not happen anyway. + throw Status( Status::TABLE_NOT_FOUND ); + } + + _log_info( myLog, + "merge table level %zu, %zu_%zu min key %s -> " + "%zu_%zu min key %s begins", + level, + opt.prefixNum, local_victim->number, + local_victim->minKey.toReadableString().c_str(), + opt.prefixNum, target_table->number, + target_table->minKey.toReadableString().c_str() ); + + Timer timer; + TableStats victim_stats; + local_victim->file->getStats(victim_stats); + _log_info(myLog, "merge victim table WSS %zu / total %zu, %zu records", + victim_stats.workingSetSizeByte, + victim_stats.totalSizeByte, + victim_stats.numKvs); + + // Lock order (in the same level): + // smaller number table to bigger number table. + uint64_t num_sm = std::min(target_table->number, local_victim->number); + uint64_t num_gt = std::max(target_table->number, local_victim->number); + + TableLockHolder tl_sm_holder( this, {num_sm} ); + if (!tl_sm_holder.ownsLock()) throw Status(Status::OPERATION_IN_PROGRESS); + + TableLockHolder tl_gt_holder(this, {num_gt}); + if (!tl_gt_holder.ownsLock()) throw Status(Status::OPERATION_IN_PROGRESS); + + // Read all records from the victim table. + itr = new TableFile::Iterator(); + TC( itr->init(nullptr, local_victim->file, empty_key, empty_key) ); + + uint64_t total_count = 0; + do { + if (d_params.compactionItrScanDelayUs) { + // If debug parameter is given, sleep here. 
+ Timer::sleepUs(d_params.compactionItrScanDelayUs); + } + + if (!isCompactionAllowed()) { + throw Status(Status::COMPACTION_CANCELLED); + } + + Record rec_out; + Record::Holder h(rec_out); + s = itr->get(rec_out); + if (!s) break; + + uint32_t key_hash_val = getMurmurHash32(rec_out.kv.key);; + uint64_t offset_out = 0; // not used. + target_table->file->setSingle(key_hash_val, rec_out, offset_out); + total_count++; + } while (itr->next().ok()); + itr->close(); + delete itr; + + // Set a dummy batch to trigger commit. + std::list dummy_batch; + std::list dummy_chk; + SizedBuf empty_key; + target_table->file->setBatch(dummy_batch, dummy_chk, empty_key, empty_key, + _SCU32(-1), false); + + { // Grab lock, add first, and then remove next. + std::lock_guard l(mani->getLock()); + mani->removeTableFile(level, local_victim); + } + + uint64_t elapsed_us = std::max( timer.getUs(), (uint64_t)1 ); + double write_rate = (double)total_count * 1000000 / elapsed_us; + _log_info( myLog, + "merge table level %zu, %zu_%zu min key %s -> " + "%zu_%zu min key %s done, %zu records, %zu us, %.1f iops", + level, + opt.prefixNum, local_victim->number, + local_victim->minKey.toReadableString().c_str(), + opt.prefixNum, target_table->number, + target_table->minKey.toReadableString().c_str(), + total_count, elapsed_us, write_rate ); + + // WARNING: + // Release it ONLY WHEN this table is not given by caller. + // If not, caller is responsible to release the table. 
+ if (!victim_table) local_victim->done(); + if (target_table) target_table->done(); + + mani->store(); + mani->sync(); + + return Status(); + + } catch (Status ss) { + _log_err(myLog, "merge failed: %d", (int)ss); + + if (itr) { + itr->close(); + delete itr; + } + if (!victim_table) local_victim->done(); + if (target_table) target_table->done(); + return ss; + } +} + +} + diff --git a/src/table_writer.cc b/src/table_writer.cc new file mode 100644 index 0000000..e3d3187 --- /dev/null +++ b/src/table_writer.cc @@ -0,0 +1,179 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "table_writer.h" + +#include "db_mgr.h" +#include "internal_helper.h" +#include "table_mgr.h" + +#include + +namespace jungle { + +TableWriterArgs::TableWriterArgs() + : writerId(0) + , stopSignal(false) + , myLog(nullptr) + {} + +void TableWriterArgs::invoke() { + awaiter.invoke(); +} + +TableWriterPkg::TableWriterPkg() + : tHandle(nullptr) + , inUse(false) + {} + +void TableWriterPkg::init(size_t writer_id, + TableWriterMgr* tw_mgr, + SimpleLogger* logger) +{ + writerArgs.writerId = writer_id; + writerArgs.myLog = logger; + tHandle = new std::thread + ( &TableWriterMgr::tableWriterLoop, tw_mgr, &writerArgs ); +} + +void TableWriterPkg::shutdown() { + writerArgs.stopSignal = true; + writerArgs.invoke(); + if (tHandle && tHandle->joinable()) { + tHandle->join(); + } + DELETE(tHandle); +} + + +TableWriterMgr::~TableWriterMgr() { + shutdown(); +} + +Status TableWriterMgr::init() { + DBMgr* mgr = DBMgr::getWithoutInit(); + assert(mgr); + + GlobalConfig* g_config = mgr->getGlobalConfig(); + + // Spawn level-0 writers. + size_t num_writers = g_config->numTableWriters; + + SimpleLogger* my_log = mgr->getLogger(); + + for (size_t ii=0; iiinit(ii, this, my_log); + tableWriters.push_back(pp); + } + + return Status(); +} + +Status TableWriterMgr::shutdown() { + for (auto& entry: tableWriters) { + TableWriterPkg*& pp = entry; + pp->shutdown(); + DELETE(pp); + } + tableWriters.clear(); + + return Status(); +} + +std::vector TableWriterMgr::leaseWriters(size_t num) { + std::vector ret; + if (!num) return ret; + + std::lock_guard l(globalLock); + + // Search idle writers upto `num`. + for (auto& entry: tableWriters) { + TableWriterPkg* pp = entry; + if (!pp->inUse) { + pp->inUse = true; + ret.push_back(pp); + if (ret.size() >= num) break; + } + } + // Even though there is no idle writer, just return empty list. 
+ return ret; +} + +void TableWriterMgr::returnWriters(const std::vector writers) { + std::lock_guard l(globalLock); + for (auto& entry: writers) { + TableWriterPkg* pp = entry; + tableWriters[pp->writerArgs.writerId]->inUse = false; + } +} + +void TableWriterMgr::tableWriterLoop(TableWriterArgs* args) { +#ifdef __linux__ + std::string thread_name = "j_twriter_" + std::to_string(args->writerId); + pthread_setname_np(pthread_self(), thread_name.c_str()); +#endif + + _log_info( args->myLog, "table Writer initiated (%zu)", + args->writerId ); + + while (!args->stopSignal) { + args->awaiter.wait_ms(1000); + args->awaiter.reset(); + + if (args->stopSignal) break; + if (args->payload.isEmpty()) continue; + + doTableWrite(args); + + // Clear the workload. + args->payload.reset(); + + // Invoke table mgr. + args->callerAwaiter.invoke(); + } + + _log_info( args->myLog, "table Writer terminated (%zu)", + args->writerId ); +} + +void TableWriterMgr::doTableWrite(TableWriterArgs* args) { + if (args->payload.batch) { + // Batch is given. + SizedBuf empty_key; + args->payload.targetTableMgr->setTableFile + ( *args->payload.batch, + *args->payload.checkpoints, + args->payload.bulkLoadMode, + args->payload.targetTableFile, + args->payload.targetHash, + empty_key, empty_key ); + + } else if (args->payload.sourceTableFile) { + // Offset is given. + args->payload.targetTableMgr->setTableFileOffset + ( *args->payload.checkpoints, + args->payload.sourceTableFile, + args->payload.targetTableFile, + *args->payload.offsets, + args->payload.startIdx, + args->payload.count ); + } +} + + +}; // namespace jungle + diff --git a/src/table_writer.h b/src/table_writer.h new file mode 100644 index 0000000..34a0699 --- /dev/null +++ b/src/table_writer.h @@ -0,0 +1,225 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "event_awaiter.h" +#include "internal_helper.h" + +#include + +#include +#include +#include +#include + +class SimpleLogger; + +namespace jungle { + +class TableFile; +class TableMgr; +class TableWriterGroup; +class TableWriterMgr; + +struct TableWritePayload { + TableWritePayload() + : payloadReady(false) + , targetTableMgr(nullptr) + , batch(nullptr) + , offsets(nullptr) + , startIdx(0) + , count(0) + , checkpoints(nullptr) + , sourceTableFile(nullptr) + , targetTableFile(nullptr) + , targetHash(_SCU32(-1)) + , bulkLoadMode(false) + {} + + // When records are given in advance. + TableWritePayload(TableMgr* _table_mgr, + std::list* _batch, + std::list* _checkpoints, + TableFile* _target_table_file, + uint32_t _target_hash, + bool _bulk_load_mode) + : payloadReady(false) + , targetTableMgr(_table_mgr) + , batch(_batch) + , offsets(nullptr) + , startIdx(0) + , count(0) + , checkpoints(_checkpoints) + , sourceTableFile(nullptr) + , targetTableFile(_target_table_file) + , targetHash(_target_hash) + , bulkLoadMode(_bulk_load_mode) + { + payloadReady = true; + } + + // When only offset is known. 
+ TableWritePayload(TableMgr* _table_mgr, + std::vector* _offsets, + uint64_t s_idx, + uint64_t c, + std::list* _checkpoints, + TableFile* _source_table_file, + TableFile* _target_table_file) + : payloadReady(false) + , targetTableMgr(_table_mgr) + , batch(nullptr) + , offsets(_offsets) + , startIdx(s_idx) + , count(c) + , checkpoints(_checkpoints) + , sourceTableFile(_source_table_file) + , targetTableFile(_target_table_file) + , targetHash(_SCU32(-1)) + , bulkLoadMode(false) + { + payloadReady = true; + } + + TableWritePayload& operator=(const TableWritePayload& src) { + std::lock_guard l(lock); + assert(payloadReady.load() == false); + + targetTableMgr = src.targetTableMgr; + batch = src.batch; + offsets = src.offsets; + startIdx = src.startIdx; + count = src.count; + checkpoints = src.checkpoints; + sourceTableFile = src.sourceTableFile; + targetTableFile = src.targetTableFile; + targetHash = src.targetHash; + bulkLoadMode = src.bulkLoadMode; + + payloadReady = true; + return *this; + } + + void reset() { + std::lock_guard l(lock); + + targetTableMgr = nullptr; + batch = nullptr; + offsets = nullptr; + startIdx = 0; + count = 0; + checkpoints = nullptr; + sourceTableFile = nullptr; + targetTableFile = nullptr; + targetHash = _SCU32(-1); + bulkLoadMode = false; + + // WARNING: + // There are many threads monitoring + // `payloadReady` flag. We SHOULD clear + // this flag at the end this function. 
+ payloadReady = false; + } + + bool isEmpty() const { return !payloadReady.load(); } + + std::mutex lock; + std::atomic payloadReady; + TableMgr* targetTableMgr; + std::list* batch; + std::vector* offsets; + uint64_t startIdx; + uint64_t count; + std::list* checkpoints; + TableFile* sourceTableFile; + TableFile* targetTableFile; + uint32_t targetHash; + bool bulkLoadMode; +}; + +struct TableWriterArgs { + TableWriterArgs(); + + void invoke(); + + uint32_t writerId; + EventAwaiter awaiter; + EventAwaiter callerAwaiter; + std::atomic stopSignal; + TableWritePayload payload; + SimpleLogger* myLog; +}; + +struct TableWriterPkg { + TableWriterPkg(); + + void init(size_t writer_id, + TableWriterMgr* tw_mgr, + SimpleLogger* logger); + + void shutdown(); + + std::thread* tHandle; + TableWriterArgs writerArgs; + std::atomic inUse; +}; + +class TableWriterMgr { +public: + TableWriterMgr() {} + + ~TableWriterMgr(); + + Status init(); + + Status shutdown(); + + void tableWriterLoop(TableWriterArgs* args); + + static void doTableWrite(TableWriterArgs* args); + + std::vector leaseWriters(size_t num); + + void returnWriters(const std::vector writers); + + // Lock for both writer groups and waiting queue. + std::mutex globalLock; + + std::vector tableWriters; + +}; + +class TableWriterHolder { +public: + TableWriterHolder(TableWriterMgr* _mgr, + size_t num_requested) + : mgr(_mgr) + { + leasedWriters = mgr->leaseWriters(num_requested); + } + + ~TableWriterHolder() { + mgr->returnWriters(leasedWriters); + } + + TableWriterMgr* mgr; + std::vector leasedWriters; +}; + + +}; // namespace jungle + diff --git a/src/worker_mgr.cc b/src/worker_mgr.cc new file mode 100644 index 0000000..bf23e22 --- /dev/null +++ b/src/worker_mgr.cc @@ -0,0 +1,160 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "worker_mgr.h" + +namespace jungle { + +WorkerBase::WorkerBase() + : workerName("noname") + , status(NO_INSTANCE) + , doNotSleepNextTime(false) +{} + +WorkerBase::~WorkerBase() { +} + +void WorkerBase::loop(WorkerOptions* opt) { + if (!opt || !opt->worker) return; + + WorkerBase* worker = opt->worker; +#ifdef __linux__ + std::string thread_name = "j_" + worker->workerName; + thread_name = thread_name.substr(0, 15); + pthread_setname_np(pthread_self(), thread_name.c_str()); +#endif + + for (;;) { + // Sleep if IDLE or STOP. + if ( (worker->status == IDLE || worker->status == STOP) && + !worker->doNotSleepNextTime.load() ) { + worker->ea.wait_ms(opt->sleepDuration_ms); + worker->ea.reset(); + } + + if (worker->status == TERMINATING) { + // Terminate. + break; + } + + // Work. + // IDLE --> WORKING. + WStatus exp = IDLE; + WStatus val = WORKING; + if (worker->status.compare_exchange_weak(exp, val)) { + worker->doNotSleepNextTime = false; + worker->work(opt); + // WORKING --> IDLE. 
+ exp = WORKING; + val = IDLE; + worker->status.compare_exchange_weak(exp, val); + } + } + worker->status = NO_INSTANCE; +} + +void WorkerBase::run() { + WStatus exp = STOP; + WStatus val = IDLE; + if (status.compare_exchange_weak(exp, val)) { + } else { + } +} + +void WorkerBase::stop() { + status = STOP; +} + +void WorkerBase::invoke() { + ea.invoke(); +} + +void WorkerBase::destroy(const DestroyOptions& options) { + status = TERMINATING; + if (handle.joinable()) { + if (options.wait) { + handle.join(); + } else { + ea.invoke(); + } + } +} + + + + +WorkerMgr::WorkerMgr() { +} + +WorkerMgr::~WorkerMgr() { + std::lock_guard ll(workersLock); + for (auto& entry: workers) { + WorkerBase* w = entry.second; + WorkerBase::DestroyOptions d_opt; + d_opt.wait = false; + w->destroy(d_opt); + + // Wait until the worker normally finishes its job. + while (w->status != WorkerBase::NO_INSTANCE) { + std::this_thread::yield(); + } + if (w->handle.joinable()) { + w->handle.join(); + } + delete w; + } +} + +Status WorkerMgr::addWorker(WorkerBase* worker) +{ + std::lock_guard ll(workersLock); + + auto itr = workers.find(worker->name()); + if (itr != workers.end()) { + return Status::ALREADY_EXIST; + } + workers.insert( std::make_pair(worker->name(), worker) ); + worker->stop(); + + return Status(); +} + +Status WorkerMgr::invokeWorker(const std::string& prefix, bool invoke_all) { + std::lock_guard ll(workersLock); + for (auto& entry: workers) { + WorkerBase* worker = entry.second; + if ( worker->name().find(prefix) != std::string::npos ) { + worker->invoke(); + if (!invoke_all) break; + } + } + + return Status(); +} + +WorkerBase* WorkerMgr::getWorker(const std::string& name) { + std::lock_guard ll(workersLock); + for (auto& entry: workers) { + WorkerBase* worker = entry.second; + if ( worker->name() == name ) { + return worker; + } + } + + return nullptr; +} + +} // namespace jungle diff --git a/src/worker_mgr.h b/src/worker_mgr.h new file mode 100644 index 0000000..43350f7 --- 
/dev/null +++ b/src/worker_mgr.h @@ -0,0 +1,101 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "event_awaiter.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace jungle { + +// Base class of child worker classes. +class WorkerBase { +public: + enum WStatus { + // Thread is not created yet. + NO_INSTANCE = 0, + // Not running, cannot work. + STOP = 1, + // Running but idle, can work. + IDLE = 2, + // Working. + WORKING = 3, + // Waiting for termination. 
+ TERMINATING = 4, + }; + + struct DestroyOptions { + DestroyOptions() : wait(true) {} + bool wait; + }; + + struct WorkerOptions { + WorkerOptions() + : sleepDuration_ms(1000) + , worker(nullptr) {} + size_t sleepDuration_ms; + WorkerBase* worker; + }; + + WorkerBase(); + virtual ~WorkerBase(); + + virtual void run(); + virtual void stop(); + virtual void invoke(); + virtual void work(WorkerOptions* options) = 0; + virtual void destroy(const DestroyOptions& options); + + virtual bool isIdle() const { return status == IDLE; } + std::string name() const { return workerName; } + + static void loop(WorkerOptions* options); + + std::thread handle; + std::string workerName; + WorkerOptions curOptions; + std::atomic status; + std::atomic doNotSleepNextTime; + + EventAwaiter ea; +}; + +class WorkerMgr { +public: + WorkerMgr(); + ~WorkerMgr(); + + Status addWorker(WorkerBase* worker); + Status invokeWorker(const std::string& prefix, bool invoke_all = false); + WorkerBase* getWorker(const std::string& name); + +private: + std::mutex workersLock; + std::unordered_map workers; +}; + + +} // namespace jungle + + diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..b26a096 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,200 @@ +set(TEST_DIR ${PROJECT_SOURCE_DIR}/tests) +set(UNIT_TEST_DIR ${TEST_DIR}/unit) +set(JUNGLE_TEST_DIR ${TEST_DIR}/jungle) +set(STRESS_TEST_DIR ${TEST_DIR}/stress) + +set(JUNGLE_TEST_DEPS + ${CMAKE_CURRENT_BINARY_DIR}/../libjungle.a + ${LIBSIMPLELOGGER} + ${FDB_LIB_DIR}/libforestdb.a + ${LIBSNAPPY} + ${LIBDL}) + +set(FILEOPS_TEST ${TEST_DIR}/unit/fileops_test.cc) +add_executable(fileops_test ${FILEOPS_TEST}) +target_link_libraries(fileops_test ${JUNGLE_TEST_DEPS}) +add_dependencies(fileops_test static_lib) + +set(FILEOPS_DIRECTIO_TEST ${TEST_DIR}/unit/fileops_directio_test.cc) +add_executable(fileops_directio_test ${FILEOPS_DIRECTIO_TEST}) +target_link_libraries(fileops_directio_test ${JUNGLE_TEST_DEPS}) 
+add_dependencies(fileops_directio_test static_lib) + +set(KEYVALUE_TEST ${TEST_DIR}/unit/keyvalue_test.cc) +add_executable(keyvalue_test ${KEYVALUE_TEST}) +target_link_libraries(keyvalue_test ${JUNGLE_TEST_DEPS}) +add_dependencies(keyvalue_test static_lib) + +set(MEMTABLE_TEST ${TEST_DIR}/unit/memtable_test.cc) +add_executable(memtable_test ${MEMTABLE_TEST}) +target_link_libraries(memtable_test ${JUNGLE_TEST_DEPS}) +add_dependencies(memtable_test static_lib) + +set(CRC32_TEST ${TEST_DIR}/unit/crc32_test.cc) +add_executable(crc32_test ${CRC32_TEST}) +target_link_libraries(crc32_test ${JUNGLE_TEST_DEPS}) +add_dependencies(crc32_test static_lib) + +set(TABLE_LOOKUP_BOOSTER_TEST ${TEST_DIR}/unit/table_lookup_booster_test.cc) +add_executable(table_lookup_booster_test ${TABLE_LOOKUP_BOOSTER_TEST}) +target_link_libraries(table_lookup_booster_test ${JUNGLE_TEST_DEPS}) +add_dependencies(table_lookup_booster_test static_lib) + +add_custom_target(unit_test) +add_dependencies(unit_test + fileops_test + fileops_directio_test + keyvalue_test + crc32_test) + + +set(CASUAL_TEST ${TEST_DIR}/jungle/casual_test.cc) +add_executable(casual_test ${CASUAL_TEST}) +target_link_libraries(casual_test ${JUNGLE_TEST_DEPS}) +add_dependencies(casual_test static_lib) + +set(BASIC_OP_TEST ${TEST_DIR}/jungle/basic_op_test.cc) +add_executable(basic_op_test ${BASIC_OP_TEST}) +target_link_libraries(basic_op_test ${JUNGLE_TEST_DEPS}) +add_dependencies(basic_op_test static_lib) + +set(SEQ_ITR_TEST ${TEST_DIR}/jungle/seq_itr_test.cc) +add_executable(seq_itr_test ${SEQ_ITR_TEST}) +target_link_libraries(seq_itr_test ${JUNGLE_TEST_DEPS}) +add_dependencies(seq_itr_test static_lib) + +set(KEY_ITR_TEST ${TEST_DIR}/jungle/key_itr_test.cc) +add_executable(key_itr_test ${KEY_ITR_TEST}) +target_link_libraries(key_itr_test ${JUNGLE_TEST_DEPS}) +add_dependencies(key_itr_test static_lib) + +set(SNAPSHOT_TEST ${TEST_DIR}/jungle/snapshot_test.cc) +add_executable(snapshot_test ${SNAPSHOT_TEST}) 
+target_link_libraries(snapshot_test ${JUNGLE_TEST_DEPS}) +add_dependencies(snapshot_test static_lib) + +set(CUSTOM_CMP_TEST ${TEST_DIR}/jungle/custom_cmp_test.cc) +add_executable(custom_cmp_test ${CUSTOM_CMP_TEST}) +target_link_libraries(custom_cmp_test ${JUNGLE_TEST_DEPS}) +add_dependencies(custom_cmp_test static_lib) + +set(CORRUPTION_TEST ${TEST_DIR}/jungle/corruption_test.cc) +add_executable(corruption_test ${CORRUPTION_TEST}) +target_link_libraries(corruption_test ${JUNGLE_TEST_DEPS}) +add_dependencies(corruption_test static_lib) + +set(COMPACTION_TEST ${TEST_DIR}/jungle/compaction_test.cc) +add_executable(compaction_test ${COMPACTION_TEST}) +target_link_libraries(compaction_test ${JUNGLE_TEST_DEPS}) +add_dependencies(compaction_test static_lib) + +set(LEVEL_EXT_TEST ${TEST_DIR}/jungle/level_extension_test.cc) +add_executable(level_extension_test ${LEVEL_EXT_TEST}) +target_link_libraries(level_extension_test ${JUNGLE_TEST_DEPS}) +add_dependencies(level_extension_test static_lib) + +set(LOG_RECLAIM_TEST ${TEST_DIR}/jungle/log_reclaim_test.cc) +add_executable(log_reclaim_test ${LOG_RECLAIM_TEST}) +target_link_libraries(log_reclaim_test ${JUNGLE_TEST_DEPS}) +add_dependencies(log_reclaim_test static_lib) + +set(MT_TEST ${TEST_DIR}/jungle/mt_test.cc) +add_executable(mt_test ${MT_TEST}) +target_link_libraries(mt_test ${JUNGLE_TEST_DEPS}) +add_dependencies(mt_test static_lib) + +set(LARGE_TEST ${TEST_DIR}/jungle/large_test.cc) +add_executable(large_test ${LARGE_TEST}) +target_link_libraries(large_test ${JUNGLE_TEST_DEPS}) +add_dependencies(large_test static_lib) + +add_custom_target(func_test) +add_dependencies(func_test + casual_test + basic_op_test + seq_itr_test + key_itr_test + snapshot_test + custom_cmp_test + mt_test + large_test) + + +set(FLUSH_ST_TEST ${TEST_DIR}/stress/flush_stress_test.cc) +add_executable(flush_stress_test ${FLUSH_ST_TEST}) +target_link_libraries(flush_stress_test ${JUNGLE_TEST_DEPS}) +add_dependencies(flush_stress_test static_lib) + 
+set(PURGE_ST_TEST ${TEST_DIR}/stress/purge_stress_test.cc) +add_executable(purge_stress_test ${PURGE_ST_TEST}) +target_link_libraries(purge_stress_test ${JUNGLE_TEST_DEPS}) +add_dependencies(purge_stress_test static_lib) + +set(ITR_ST_TEST ${TEST_DIR}/stress/iterator_stress_test.cc) +add_executable(iterator_stress_test ${ITR_ST_TEST}) +target_link_libraries(iterator_stress_test ${JUNGLE_TEST_DEPS}) +add_dependencies(iterator_stress_test static_lib) + +set(COMPACT_ST_TEST ${TEST_DIR}/stress/compactor_stress_test.cc) +add_executable(compactor_stress_test ${COMPACT_ST_TEST}) +target_link_libraries(compactor_stress_test ${JUNGLE_TEST_DEPS}) +add_dependencies(compactor_stress_test static_lib) + +set(LOG_RC_ST_TEST ${TEST_DIR}/stress/log_reclaim_stress_test.cc) +add_executable(log_reclaim_stress_test ${LOG_RC_ST_TEST}) +target_link_libraries(log_reclaim_stress_test ${JUNGLE_TEST_DEPS}) +add_dependencies(log_reclaim_stress_test static_lib) + +set(MANY_LOG_TEST ${TEST_DIR}/stress/many_log_files_test.cc) +add_executable(many_log_files_test ${MANY_LOG_TEST}) +target_link_libraries(many_log_files_test ${JUNGLE_TEST_DEPS}) +add_dependencies(many_log_files_test static_lib) + +add_custom_target(stress_test) +add_dependencies(stress_test + flush_stress_test + purge_stress_test + many_log_files_test) + + +set(BASIC_ROBUST_CHILD ${TEST_DIR}/robust/basic_robust_child.cc) +add_executable(basic_robust_child ${BASIC_ROBUST_CHILD}) +target_link_libraries(basic_robust_child ${JUNGLE_TEST_DEPS}) +add_dependencies(basic_robust_child static_lib) + +set(DIST_TEST + ${TEST_DIR}/bench/dist_def_test.cc) +add_executable(dist_test ${DIST_TEST}) +target_link_libraries(dist_test ${JUNGLE_TEST_DEPS}) +add_dependencies(dist_test static_lib) + + +# --- Benchmark --- + +set(BENCH + ${TEST_DIR}/bench/bench.cc + ${TEST_DIR}/bench/db_adapter_jungle.cc) +add_executable(jungle_bench ${BENCH}) +set_target_properties(jungle_bench + PROPERTIES COMPILE_FLAGS + "-DJUNGLE_ADAPTER=1") 
+target_link_libraries(jungle_bench ${JUNGLE_TEST_DEPS}) +add_dependencies(jungle_bench static_lib) + +# If RocksDB library & header paths are given, compile it as well. +if (ROCKSDB_LIBRARY AND ROCKSDB_INCLUDE) + message(STATUS "RocksDB library: ${ROCKSDB_LIBRARY}") + message(STATUS "RocksDB include: ${ROCKSDB_INCLUDE}") + set(ROCKS_BENCH + ${TEST_DIR}/bench/bench.cc + ${TEST_DIR}/bench/db_adapter_rocksdb.cc) + add_executable(rocksdb_bench ${ROCKS_BENCH}) + set_target_properties(rocksdb_bench + PROPERTIES COMPILE_FLAGS + "-DROCKSDB_ADAPTER=1") + target_link_libraries(rocksdb_bench + ${ROCKSDB_LIBRARY} + ${JUNGLE_TEST_DEPS}) + add_dependencies(rocksdb_bench static_lib) + +endif () diff --git a/tests/bench/adapter_selector.h b/tests/bench/adapter_selector.h new file mode 100644 index 0000000..a92b1b5 --- /dev/null +++ b/tests/bench/adapter_selector.h @@ -0,0 +1,33 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "db_adapter.h" +#include "db_adapter_jungle.h" + +namespace jungle_bench { + +#if defined(JUNGLE_ADAPTER) + DbAdapter* getAdapter() { + return new JungleAdapter(); + } +#else + #make_error "Database adapter is not specified." 
+#endif + +} + diff --git a/tests/bench/bench.cc b/tests/bench/bench.cc new file mode 100644 index 0000000..96dd324 --- /dev/null +++ b/tests/bench/bench.cc @@ -0,0 +1,947 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "bench_config.h" +#include "internal_helper.h" +#include "latency_collector.h" +#include "test_common.h" + +// TODO: Other DBs +#include "adapter_selector.h" +#include "db_adapter_jungle.h" + +#include "murmurhash3.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace jungle_bench { + +#define MAX_KEYLEN (2048) + +static LatencyCollector global_lat; +static bool force_use_existing = false; + +static char* ABT_ARRAY = (char*)"abcdefghijklmnopqrstuvwxyz"; +static size_t ABT_NUM = 26; + +void generate_key(const BenchConfig& conf, + uint64_t index, + char* buf, + size_t& buflen_inout) +{ + size_t len = conf.keyLen.get(index); + // Minimum length: 8 bytes. + if (len < 8) len = 8; + + uint64_t vv = index; + int ii = 7; + while (vv >= ABT_NUM) { + buf[ii--] = ABT_ARRAY[vv % ABT_NUM]; + vv /= ABT_NUM; + } + buf[ii--] = ABT_ARRAY[vv]; + for (int jj=ii; jj>=0; --jj) { + buf[jj] = ABT_ARRAY[0]; + } + + vv = index; + +#if 1 + uint32_t seed = 0; + for (size_t jj=8; jj ret.size()) ? 
len - ret.size() : 0; + ret += "_" + std::string(remain, 'x'); + return ret; +} + +uint64_t get_write_bytes() { +#if defined(__linux) && !defined(__ANDROID__) + FILE *fp = fopen("/proc/self/io", "r"); + while(!feof(fp)) { + char str[64]; + unsigned long temp; + int ret = fscanf(fp, "%s %lu", str, &temp); + (void)ret; + if (!strcmp(str, "write_bytes:")) { + fclose(fp); + return temp; + } + } + fclose(fp); +#endif + // TODO: Other platforms? + return 0; +} + +uint64_t get_cpu_usage_ms() { +#if defined(__linux) && !defined(__ANDROID__) + std::ifstream fs; + std::string path = "/proc/self/stat"; + + fs.open(path.c_str()); + if (!fs.good()) return 0; + + std::string dummy_str; + uint64_t dummy_int; + uint64_t user_time_ms = 0; + uint64_t kernel_time_ms = 0; + + // 1) pid + // 2) executable name (str) + // 3) state (str) + // 4) ppid + // 5) pgrp + // 6) session + // 7) tty + // 8) tpgid + // 9) flags + // 10) # minor page faults + // 11) # minor page faults including children + // 12) # major page faults + // 13) # major page faults including children + // 14) user time + // 15) kernel time + // ... + + fs >> dummy_int >> dummy_str >> dummy_str; + fs >> dummy_int >> dummy_int >> dummy_int >> dummy_int; + fs >> dummy_int >> dummy_int >> dummy_int >> dummy_int; + fs >> dummy_int >> dummy_int >> user_time_ms >> kernel_time_ms; + + fs.close(); + + // TODO: currently assuming 100Hz (10ms) jiffy. + // It should support all kinds of platforms. + user_time_ms *= 10; + kernel_time_ms *= 10; + return user_time_ms + kernel_time_ms; +#endif + // TODO: Other platforms? + return 0; +} + +uint64_t get_mem_usage() { +#if defined(__linux) && !defined(__ANDROID__) + std::ifstream fs; + std::string path = "/proc/self/statm"; + + fs.open(path.c_str()); + if (!fs.good()) return 0; + + uint64_t dummy_int; + uint64_t rss = 0; + fs >> dummy_int >> rss >> dummy_int; + fs.close(); + + uint32_t pgsize = sysconf(_SC_PAGESIZE); + rss *= pgsize; + return rss; +#endif + // TODO: Other platforms? 
+ return 0; +} + +void print_latencies( const std::string& log_file, + const std::vector& items ) +{ + char buffer[1024]; + std::stringstream ss; + + sprintf(buffer, + "%12s%12s%12s%12s%12s%12s\n", + "type", "p50", "p99", "p99.9", "p99.99", "p99.999" ); + ss << buffer; + + for (const std::string& item: items) { + sprintf( + buffer, + "%12s%12s%12s%12s%12s%12s\n", + item.c_str(), + TestSuite::usToString(global_lat.getPercentile(item, 50)).c_str(), + TestSuite::usToString(global_lat.getPercentile(item, 99)).c_str(), + TestSuite::usToString(global_lat.getPercentile(item, 99.9)).c_str(), + TestSuite::usToString(global_lat.getPercentile(item, 99.99)).c_str(), + TestSuite::usToString(global_lat.getPercentile(item, 99.999)).c_str() ); + ss << buffer; + } + ss << "-----\n"; + TestSuite::_msg("%s", ss.str().c_str()); + + std::ofstream fs; + fs.open(log_file, std::ofstream::out | std::ofstream::app); + if (!fs.good()) return; + + fs << ss.str(); + fs.close(); +} + +int initial_load(const BenchConfig& conf, + const std::string& log_file_name, + DbAdapter* db_inst, + std::atomic& stop_signal) +{ + if (!conf.initialLoad) { + TestSuite::_msg("skipping initial load, use existing DB\n"); + TestSuite::_msg("-----\n"); + return 0; + } + + CHK_Z( db_inst->startInitialLoad() ); + + // NOTE: + // Initial loading in a random order will be + // the worst case scenario for LSM-based approaches, + // as the key range every merge operation will be + // as wide as the entire key space, so that will touch + // almost all next level tables. + std::vector key_arr(conf.numKvPairs); + std::iota(key_arr.begin(), key_arr.end(), 0); + + if (conf.initialLoadOrder == BenchConfig::RANDOM) { + // Randomly ordered initial load, shuffle. 
+ std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(key_arr.begin(), key_arr.end(), g); + } + + uint64_t wamp_base = get_write_bytes(); + uint64_t cpu_base = get_cpu_usage_ms(); + + std::ofstream log_fs; + log_fs.open( log_file_name, std::ofstream::out | std::ofstream::app ); + + TestSuite::Timer tt; + TestSuite::WorkloadGenerator wg(conf.initialLoadRate); + + // time xx s + // initial load x / y speed xx.x ops/s + // w.amp s.amp + // average cpu rss + TestSuite::Displayer dd(4, 4); + TestSuite::Timer dd_timer(80); + TestSuite::Timer wamp_timer(1500); + TestSuite::Timer samp_timer(5000); + TestSuite::Timer log_timer(5000); + dd.init(); + dd.setWidth( {20, 15, 10, 18} ); + dd.set(0, 0, "time"); + dd.set(1, 0, "initial load"); + dd.set(2, 0, "w.amp"); + dd.set(2, 2, "s.amp"); + dd.set(3, 0, "average cpu"); + dd.set(3, 2, "rss"); + + uint64_t w_amount = 0; + uint64_t s_amount = 0; + uint64_t cpu_ms = 0; + uint64_t rss_amount = 0; + + char key_buf[MAX_KEYLEN]; + size_t key_len = 0; + for (size_t ii = 0; ii < conf.numKvPairs; ++ii) { + while (!wg.getNumOpsToDo()) { + TestSuite::sleep_ms(1); + } + + DbAdapter::KvElem elem; + // TODO: Should be replaced with random generation. + generate_key(conf, key_arr[ii], key_buf, key_len); + elem.key = DbAdapter::Buffer(key_buf, key_len); + + std::string value_str = generate_value(conf, key_arr[ii]); + elem.value = DbAdapter::Buffer(value_str); + + { TestSuite::Timer set_timer; + db_inst->set(elem); + global_lat.addLatency("init.load", set_timer.getTimeUs()); + } + + uint64_t num_sets = ii + 1; + + if ( dd_timer.timeout() || + num_sets == conf.numKvPairs ) { + dd_timer.reset(); + + dd.set(0, 1, "%zu s", tt.getTimeSec()); + dd.set(1, 1, "%zu", num_sets); + dd.set( 1, 3, "%.1f ops/s", + (double)num_sets * 1000000 / tt.getTimeUs() ); + + if (wamp_timer.timeout()) { + wamp_timer.reset(); + + // Write amplification. 
+ w_amount = get_write_bytes() - wamp_base; + dd.set( 2, 1, "%.1f", + (double)w_amount / num_sets / + ( conf.keyLen.median + conf.valueLen.median ) ); + + // CPU. + cpu_ms = get_cpu_usage_ms() - cpu_base; + dd.set( 3, 1, "%.1f %%", (double)cpu_ms / tt.getTimeMs() * 100 ); + + // RSS. + rss_amount = get_mem_usage(); + dd.set( 3, 3, "%s", TestSuite::sizeToString(rss_amount).c_str() ); + } + + if (samp_timer.timeout()) { + samp_timer.reset(); + + // Space amplification. + s_amount = jungle::FileMgr::dirSize(conf.dbPath, true); + dd.set( 2, 3, "%.1f", + (double)s_amount / num_sets / + ( conf.keyLen.median + conf.valueLen.median ) ); + } + + // Write log file. + if (log_timer.timeout()) { + log_timer.reset(); + + log_fs << TestSuite::getTimeStringPlain() << "\t" + << num_sets << "\t" + << 0 << "\t" + << 0 << "\t" + << w_amount << "\t" + << s_amount << "\t" + << cpu_ms << "\t" + << rss_amount + << std::endl; + } + + dd.print(); + } + + if (stop_signal) break; + wg.addNumOpsDone(1); + } + dd.print(); + + CHK_Z( db_inst->endInitialLoad() ); + + TestSuite::_msg("-----\n"); + log_fs << "-----" << std::endl; + log_fs.close(); + + print_latencies(log_file_name, {"init.load"}); + + return 0; +} + +struct BenchStatus { + BenchStatus() + : numSet(0) + , numPoint(0) + , numRange(0) + , amountWriteByte(0) + , amountSpaceByte(0) + , cpuMs(0) + , rssByte(0) + {} + std::atomic numSet; + std::atomic numPoint; + std::atomic numRange; + std::atomic amountWriteByte; + std::atomic amountSpaceByte; + std::atomic cpuMs; + std::atomic rssByte; +}; + +int do_write(WorkerHandle* args, + const WorkerDef& my_def, + TestSuite::WorkloadGenerator& wg) +{ + thread_local TestSuite::Timer tt; + + if (my_def.rate >= 0) { + size_t ops_todo = wg.getNumOpsToDo(); + if (!ops_todo) { + TestSuite::sleep_us(100); + return 0; + } + } + + thread_local char key_buf[MAX_KEYLEN]; + size_t key_len = 0; + uint64_t idx = std::rand() % args->conf.numKvPairs; + DbAdapter::KvElem elem; + generate_key(args->conf, idx, 
key_buf, key_len); + elem.key = DbAdapter::Buffer(key_buf, key_len); + std::string value_str = generate_value(args->conf, idx); + elem.value = DbAdapter::Buffer(value_str); + { + tt.reset(); + CHK_Z( args->dbInst->set(elem) ); + global_lat.addLatency("set", tt.getTimeUs()); + } + + wg.addNumOpsDone(1); + args->stats.numSet.fetch_add(1); + return 0; +} + +int do_point_read(WorkerHandle* args, + const WorkerDef& my_def, + TestSuite::WorkloadGenerator& wg) +{ + thread_local TestSuite::Timer tt; + + if (my_def.rate >= 0) { + size_t ops_todo = wg.getNumOpsToDo(); + if (!ops_todo) { + TestSuite::sleep_us(100); + return 0; + } + } + + char key_buf[MAX_KEYLEN]; + size_t key_len = 0; + uint64_t idx = std::rand() % args->conf.numKvPairs; + generate_key(args->conf, idx, key_buf, key_len); + bool succ = true; + { + tt.reset(); + DbAdapter::Buffer value_out; + int ret = args->dbInst->get( DbAdapter::Buffer(key_buf, key_len), + value_out ); + if (ret != 0) { + abort(); + succ = false; + } + global_lat.addLatency("get", tt.getTimeUs()); + value_out.free(); + } + + if (succ) { + wg.addNumOpsDone(1); + args->stats.numPoint.fetch_add(1); + } + return 0; +} + +int do_range_read(WorkerHandle* args, + const WorkerDef& my_def, + TestSuite::WorkloadGenerator& wg) +{ + thread_local TestSuite::Timer tt; + + if (my_def.rate >= 0) { + size_t ops_todo = wg.getNumOpsToDo(); + if (!ops_todo) { + TestSuite::sleep_us(100); + return 0; + } + } + + thread_local char skey_buf[MAX_KEYLEN], ekey_buf[MAX_KEYLEN]; + size_t skey_len = 0, ekey_len = 0; + + uint64_t s_idx = std::rand() % args->conf.numKvPairs; + generate_key(args->conf, s_idx, skey_buf, skey_len); + + size_t batch_size = my_def.batchSize.get(); + if (s_idx + batch_size >= args->conf.numKvPairs) { + batch_size = args->conf.numKvPairs - s_idx - 1; + } + + // NOTE: Both start/end keys are inclusive, + // so that `batchsize == 0` means that `start_key == end_key`. 
+ generate_key(args->conf, s_idx + batch_size, ekey_buf, ekey_len); + + bool succ = true; + { + tt.reset(); + std::list kvs_out; + int ret = args->dbInst->getRange + ( DbAdapter::Buffer(skey_buf, skey_len), + DbAdapter::Buffer(ekey_buf, ekey_len), + kvs_out ); + if (ret != 0) { + abort(); + succ = false; + } + // The number of returned records should metach. + assert( kvs_out.size() == batch_size + 1 ); + + global_lat.addLatency("iterate", tt.getTimeUs()); + + for (auto& entry: kvs_out) { + DbAdapter::KvElem& kk = entry; + kk.key.free(); + kk.value.free(); + } + } + + if (succ) { + wg.addNumOpsDone(1); + args->stats.numRange.fetch_add(1); + } + return 0; +} + +int bench_worker(TestSuite::ThreadArgs* base_args) { + WorkerHandle* args = static_cast(base_args); + const WorkerDef& my_def = args->conf.workerDefs[args->wId]; + +#ifdef __linux__ + std::string t_name = "bench_w_" + std::to_string(args->wId); + pthread_setname_np(pthread_self(), t_name.c_str()); +#endif + + TestSuite::WorkloadGenerator wg(my_def.rate); + while (!args->stopSignal.load()) { + switch (my_def.type) { + case WorkerDef::WRITER: + CHK_Z( do_write(args, my_def, wg) ); + break; + + case WorkerDef::POINT_READER: + CHK_Z( do_point_read(args, my_def, wg) ); + break; + + case WorkerDef::RANGE_READER: + CHK_Z( do_range_read(args, my_def, wg) ); + break; + + default: + TestSuite::sleep_us(1000); + break; + } + } + + return 0; +} + +struct DisplayArgs : public TestSuite::ThreadArgs { + DisplayArgs(const BenchConfig& _conf, + BenchStatus& _stat, + std::atomic& _stop_signal) + : conf(_conf) + , stat(_stat) + , stopSignal(_stop_signal) + {} + std::string dbFile; + std::string logFile; + const BenchConfig& conf; + BenchStatus& stat; + std::atomic& stopSignal; +}; + +int displayer(TestSuite::ThreadArgs* base_args) { +#ifdef __linux__ + pthread_setname_np(pthread_self(), "displayer"); +#endif + + DisplayArgs* args = static_cast(base_args); + + // x/x total IOPS p50 p99 + // set + // point get + // range get + // 
w.amp + // s.amp + // cpu rss + + // 6 rows, 5 columns + + TestSuite::Displayer dd(7, 5); + dd.init(); + dd.set(0, 1, "total"); + dd.set(0, 2, "IOPS"); + dd.set(0, 3, "p50"); + dd.set(0, 4, "p99"); + dd.set(1, 0, "set"); + dd.set(2, 0, "point get"); + dd.set(3, 0, "range get"); + dd.set(4, 0, "w.amp"); + dd.set(5, 0, "s.amp"); + dd.set(6, 0, "cpu"); + dd.set(6, 3, "rss"); + + std::vector col_width( {15, 15, 15, 15, 15} ); + dd.setWidth(col_width); + + uint64_t wamp_base = get_write_bytes(); + uint64_t cpu_base = get_cpu_usage_ms(); + uint64_t cpu_inst_base = get_cpu_usage_ms(); + + std::ofstream log_fs; + log_fs.open( args->logFile, std::ofstream::out | std::ofstream::app ); + + TestSuite::Timer tt; + TestSuite::Timer wamp_timer(1500); + TestSuite::Timer samp_timer(5000); + TestSuite::Timer log_timer(5000); + tt.resetSec(args->conf.durationSec); + while (!tt.timeout() && !args->stopSignal.load()) { + TestSuite::sleep_ms(80); + uint64_t cur_us = tt.getTimeUs(); + if (!cur_us) continue; + + dd.set( 0, 0, "%zu/%zu s", cur_us / 1000000, args->conf.durationSec ); + + // writer row + dd.set( 1, 1, "%s", + TestSuite::countToString( args->stat.numSet ).c_str() ); + dd.set( 1, 2, "%s ops/s", + TestSuite::countToString + ( args->stat.numSet * 1000000 / cur_us ).c_str() ); + dd.set( 1, 3, "%s", + TestSuite::usToString + ( global_lat.getPercentile("set", 50) ).c_str() ); + dd.set( 1, 4, "%s", + TestSuite::usToString + ( global_lat.getPercentile("set", 99) ).c_str() ); + + // p reader row + dd.set( 2, 1, "%s", + TestSuite::countToString( args->stat.numPoint ).c_str() ); + dd.set( 2, 2, "%s ops/s", + TestSuite::countToString + ( args->stat.numPoint * 1000000 / cur_us ).c_str() ); + dd.set( 2, 3, "%s", + TestSuite::usToString + ( global_lat.getPercentile("get", 50) ).c_str() ); + dd.set( 2, 4, "%s", + TestSuite::usToString + ( global_lat.getPercentile("get", 99) ).c_str() ); + + // r reader row + dd.set( 3, 1, "%s", + TestSuite::countToString( args->stat.numRange ).c_str() ); + 
dd.set( 3, 2, "%s ops/s", + TestSuite::countToString + ( args->stat.numRange * 1000000 / cur_us ).c_str() ); + dd.set( 3, 3, "%s", + TestSuite::usToString + ( global_lat.getPercentile("iterate", 50) ).c_str() ); + dd.set( 3, 4, "%s", + TestSuite::usToString + ( global_lat.getPercentile("iterate", 99) ).c_str() ); + + // w.amp row + if (wamp_timer.timeout()) { + uint64_t inst_time_us = wamp_timer.getTimeUs(); + uint64_t inst_time_ms = inst_time_us / 1000; + wamp_timer.reset(); + + uint64_t w_amt = get_write_bytes() - wamp_base; + + // Amount of writes. + dd.set( 4, 1, "%s", + TestSuite::sizeToString(w_amt).c_str() ); + if (args->stat.numSet.load()) { + // Write amplification. + dd.set( 4, 2, "%.1fx", + (double)w_amt / args->stat.numSet / + ( args->conf.keyLen.median + args->conf.valueLen.median ) ); + } else { + dd.set( 4, 2, "--" ); + } + // Instant write throughput. + dd.set( 4, 3, "%s/s", + TestSuite::sizeThroughputStr + ( w_amt - args->stat.amountWriteByte, + inst_time_us ).c_str() ); + // Average write throughput. + dd.set( 4, 4, "%s/s", + TestSuite::sizeThroughputStr + ( w_amt, tt.getTimeUs() ).c_str() ); + args->stat.amountWriteByte = w_amt; + + // CPU. + uint64_t cur_cpu_ms = get_cpu_usage_ms(); + uint64_t cpu_ms = cur_cpu_ms - cpu_base; + uint64_t cpu_ms_inst = cur_cpu_ms - cpu_inst_base; + cpu_inst_base = cur_cpu_ms; + args->stat.cpuMs = cpu_ms; + // Instant usage (since last display). + dd.set( 6, 1, "%.1f %%", (double)cpu_ms_inst / inst_time_ms * 100 ); + // Overall average. + dd.set( 6, 2, "%.1f %%", (double)cpu_ms / tt.getTimeMs() * 100 ); + + // RSS. 
+ uint64_t rss_amount = get_mem_usage(); + args->stat.rssByte = rss_amount; + dd.set( 6, 4, "%s", TestSuite::sizeToString(rss_amount).c_str() ); + } + + // s.amp row + if (samp_timer.timeout()) { + samp_timer.reset(); + + uint64_t samp = jungle::FileMgr::dirSize(args->dbFile, true); + args->stat.amountSpaceByte = samp; + + dd.set( 5, 1, "%s", + TestSuite::sizeToString(samp).c_str() ); + dd.set( 5, 2, "%.1fx", + (double)samp / args->conf.numKvPairs / + ( args->conf.keyLen.median + args->conf.valueLen.median ) ); + } + + // Write log file. + if (log_timer.timeout()) { + log_timer.reset(); + + log_fs + << TestSuite::getTimeStringPlain() << "\t" + << args->stat.numSet.load() << "\t" + << args->stat.numPoint.load() << "\t" + << args->stat.numRange.load() << "\t" + << args->stat.amountWriteByte.load() << "\t" + << args->stat.amountSpaceByte.load() << "\t" + << args->stat.cpuMs.load() << "\t" + << args->stat.rssByte.load() + << std::endl; + } + dd.print(); + + // Check the number of ops if set. + if (args->conf.durationOps) { + uint64_t ops = args->stat.numSet + + args->stat.numPoint + + args->stat.numRange; + if ( args->conf.durationOps && + ops >= args->conf.durationOps ) break; + } + } + + TestSuite::_msg("-----\n"); + log_fs << "-----\n"; + log_fs.close(); + args->stopSignal.store(true); + + return 0; +} + +static void (*old_sigint_handler)(int) = nullptr; +static std::atomic* global_ref_of_stop_signal = nullptr; +static std::atomic* global_ref_of_skip_cooling_down = nullptr; + +void custom_handler_2nd(int signum) +{ + std::cout << std::endl + << "Force termination (DB corruption may happen)" + << std::endl; + exit(-1); +} + +void custom_handler(int signum) +{ + std::cout << std::endl + << "Got termination signal, will wait for safe DB close.." 
+ << std::endl; + signal(SIGINT, custom_handler_2nd); + global_ref_of_stop_signal->store(true); + global_ref_of_skip_cooling_down->store(true); +} + +int bench_main(const std::string& config_file) { + BenchConfig conf = BenchConfig::load(config_file); + if (force_use_existing) conf.initialLoad = false; + std::cout << conf.toString() << std::endl; + + std::string db_file = conf.dbPath; + if (conf.initialLoad && jungle::FileMgr::exist(db_file)) { + // Previous DB already exists, ask user before delete it. + char answer[64], + *ret = nullptr; + memset(answer, 0x0, sizeof(answer)); + + std::cout << "-----" << std::endl + << "Previous DB file " + << db_file + << " already exists. " + << "Are you sure to remove it (y/N)? "; + ret = fgets(answer, sizeof(answer), stdin); + + if (ret && !(answer[0] == 'Y' || answer[0] == 'y')) { + return 0; + } + if (!db_file.empty() && db_file != "/") { + std::string cmd = "rm -rf " + db_file + "/*"; + int r = ::system(cmd.c_str()); + (void)r; + assert(r == 0); + } + } + + std::atomic stop_signal(false); + std::atomic skip_cooling_down(false); + global_ref_of_stop_signal = &stop_signal; + global_ref_of_skip_cooling_down = &skip_cooling_down; + old_sigint_handler = signal(SIGINT, custom_handler); + + TestSuite::_msg("-----\n"); + DbAdapter* db_inst = getAdapter(); + CHK_Z( db_inst->open(db_file, conf, conf.dbConf) ); + TestSuite::_msg("DB settings: %s\n", conf.dbConf.dump().c_str()); + + std::ofstream log_fs; + std::string log_file_name = conf.metricsPath + "/" + + db_inst->getName() + "_" + "bench_log_" + + TestSuite::getTimeStringPlain();; + log_fs.open( log_file_name, std::ofstream::out | std::ofstream::app ); + log_fs << conf.confJson.dump() << std::endl; + log_fs << "-----" << std::endl; + log_fs.close(); + + TestSuite::_msg("-----\n"); + CHK_Z( initial_load(conf, log_file_name, db_inst, stop_signal) ); + + // Warming up phase. 
+ { TestSuite::Progress pp(conf.warmingUpSec, "warming up", "s"); + TestSuite::Timer tt; + tt.resetSec(conf.warmingUpSec); + while (!tt.timeout() && !stop_signal) { + pp.update(tt.getTimeSec()); + TestSuite::sleep_ms(100); + } + pp.done(); + TestSuite::_msg("-----\n"); + } + + BenchStatus cur_status; + + // Spawn workers. + size_t num_workers = conf.workerDefs.size(); + std::vector w_handles(num_workers); + std::vector w_holders(num_workers); + for (size_t ii=0; iijoin(); + CHK_Z( w_holders[ii]->getResult() ); + delete w_holders[ii]; + delete w_handles[ii]; + } + + // Print out latencies. + print_latencies(d_args.logFile, {"set", "get", "iterate"}); + + // Cooling down phase. + { TestSuite::Progress pp(conf.coolingDownSec, "cooling down", "s"); + TestSuite::Timer tt; + tt.resetSec(conf.coolingDownSec); + while (!tt.timeout() && !skip_cooling_down) { + pp.update(tt.getTimeSec()); + TestSuite::sleep_ms(100); + } + pp.done(); + TestSuite::_msg("-----\n"); + } + + CHK_Z( db_inst->close() ); + db_inst->shutdown(); + delete db_inst; + + return 0; +} + +void usage(int argc, char** argv) { + std::stringstream ss; + ss << "Usage: \n"; + ss << " " << argv[0] << " [config file name] \n" + << std::endl + << "Options:\n" + << " -e, --use-existing:\n" + << " Force use the existing DB. It will overwrites the config " + << "in the given file." 
+ << std::endl; + + std::cout << ss.str(); + exit(0); +} + +int check_args(int argc, char** argv) { + int input_file_idx = 0; + for (int ii=1; ii +#include +#include + +namespace jungle_bench { + +struct BenchConfig { + enum InitialLoadOrder { + SEQ = 0x0, + RANDOM = 0x1, + }; + + BenchConfig() + : dbPath() + , metricsPath("./") + , durationSec(10) + , durationOps(0) + , warmingUpSec(3) + , coolingDownSec(5) + , numKvPairs(100000) + , initialLoad(true) + , initialLoadRate(20000) + , initialLoadOrder(RANDOM) + , keyLen(DistDef::RANDOM, 8, 0) + , valueLen(DistDef::RANDOM, 512, 0) + {} + + static BenchConfig load(const std::string& filename) { + BenchConfig conf; + json::JSON obj; + read_json_object(filename, obj); + conf.confJson = obj; + + _jstr(conf.dbPath, obj, "db_path"); + _jstr(conf.metricsPath, obj, "log_path"); + _jint(conf.durationSec, obj, "duration_sec"); + _jint(conf.durationOps, obj, "duration_ops"); + _jint(conf.warmingUpSec, obj, "warming_up_sec"); + _jint(conf.coolingDownSec, obj, "cooling_down_sec"); + _jint(conf.numKvPairs, obj, "num_kv_pairs"); + _jbool(conf.initialLoad, obj, "initial_load"); + _jint(conf.initialLoadRate, obj, "initial_load_rate"); + + std::string initial_load_order; + _jstr(initial_load_order, obj, "initial_load_order"); + conf.initialLoadOrder = + (initial_load_order.find("seq") == std::string::npos) + ? 
RANDOM : SEQ; + + if (obj["key"].NotNull()) { + conf.keyLen = load_dist_def_from_json( obj["key"] ); + } + if (obj["value"].NotNull()) { + conf.valueLen = load_dist_def_from_json( obj["value"] ); + } + conf.dbConf = obj["db_configs"]; + + // Workers + json::JSON w_obj = obj["workers"]; + if (w_obj.NotNull()) { + size_t num = w_obj.size(); + + for (size_t ii = 0; ii < num; ++ii) { + WorkerDef ww = WorkerDef::load(w_obj[ii]); + if (ww.type != WorkerDef::UNKNOWN) { + conf.workerDefs.push_back(ww); + } + } + } + + if (conf.dbPath.empty()) conf.dbPath = "./bench_db"; + if (conf.metricsPath.empty()) conf.metricsPath = "./"; + + return conf; + } + + std::string toString() { + char msg[1024]; + std::stringstream ss; + sprintf( msg, "total %zu kv, duration %s", + numKvPairs, + TestSuite::usToString(durationSec * 1000000).c_str() ); + ss << msg; + if (durationOps) { + sprintf(msg, " OR %s ops\n", + TestSuite::countToString(durationOps).c_str()); + ss << msg; + } else { + ss << std::endl; + } + + ss << "key: " << keyLen.toString() << std::endl; + ss << "value: " << valueLen.toString() << std::endl; + ss << "approx working set size: " + << TestSuite::sizeToString + ( (keyLen.median + valueLen.median) * numKvPairs ) + << std::endl; + ss << "DB path: " << dbPath << std::endl; + ss << "log path: " << metricsPath << std::endl; + if (initialLoad) { + ss << " - will do initial load: " + << initialLoadRate << ", " + << ((initialLoadOrder == SEQ) ? 
"SEQ" : "RANDOM") + << std::endl; + } else { + ss << " - will use existing DB" << std::endl; + } + + ss << " -- " << workerDefs.size() << " workers --" << std::endl; + for (size_t ii = 0; ii < workerDefs.size(); ++ii) { + WorkerDef& wd = workerDefs[ii]; + ss << " [" << ii << "] "; + ss << wd.toString(); + if (ii < workerDefs.size() - 1) ss << std::endl; + } + + return ss.str(); + } + + std::string dbPath; + std::string metricsPath; + size_t durationSec; + size_t durationOps; + size_t warmingUpSec; + size_t coolingDownSec; + size_t numKvPairs; + bool initialLoad; + size_t initialLoadRate; + InitialLoadOrder initialLoadOrder; + DistDef keyLen; + DistDef valueLen; + std::vector workerDefs; + json::JSON confJson; + // It will be passed to DB adapter. + json::JSON dbConf; +}; + +}; + diff --git a/tests/bench/bench_worker.h b/tests/bench/bench_worker.h new file mode 100644 index 0000000..663ebb9 --- /dev/null +++ b/tests/bench/bench_worker.h @@ -0,0 +1,119 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "json_common.h" +#include "json_to_dist_def.h" + +#include "test_common.h" + +#include +#include +#include + +namespace jungle_bench { + +struct BenchConfig; +struct BenchStatus; +class DbAdapter; +struct WorkerDef { + enum Type { + WRITER = 0x0, + POINT_READER = 0x1, + RANGE_READER = 0x2, + UNKNOWN = 0x3, + }; + + WorkerDef(Type _type = WRITER, + uint64_t _rate = 1, + DistDef _dist = DistDef()) + : type(_type) + , rate(_rate) + , batchSize(_dist) + {} + + static WorkerDef load(json::JSON& obj) { + std::string type_str; + uint64_t rate; + _jstr(type_str, obj, "type"); + _jint(rate, obj, "rate"); + + Type type = UNKNOWN; + if (!type_str.empty()) { + if (type_str[0] == 'p' || type_str[0] == 'P') { + type = POINT_READER; + } else if (type_str[0] == 'r' || type_str[0] == 'R') { + type = RANGE_READER; + } else if (type_str[0] == 'w' || type_str[0] == 'W') { + type = WRITER; + } + } + + DistDef batch = load_dist_def_from_json( obj["batch"] ); + return WorkerDef(type, rate, batch); + } + + std::string toString() { + char msg[128]; + static std::unordered_map + w_type_name + ( { {WRITER, "writer"}, + {POINT_READER, "P reader"}, + {RANGE_READER, "R reader"} } ); + + std::stringstream ss; + if (rate >= 0) { + sprintf( msg, "%-10s %8zu.0 ops/sec", + w_type_name[type].c_str(), + (size_t)rate ); + } else { + sprintf( msg, "%-10s MAX SPEED", + w_type_name[type].c_str() ); + } + ss << msg; + if (type == RANGE_READER) { + ss << ", batch " << batchSize.toString(); + } + return ss.str(); + } + + Type type; + int64_t rate; + DistDef batchSize; +}; + +struct WorkerHandle : public TestSuite::ThreadArgs { + WorkerHandle(size_t _id, + DbAdapter* _db_inst, + const BenchConfig& _conf, + BenchStatus& _stat, + std::atomic& _stop_signal) + : wId(_id) + , dbInst(_db_inst) + , conf(_conf) + , stats(_stat) + , stopSignal(_stop_signal) + {} + size_t wId; + DbAdapter* dbInst; + const 
BenchConfig& conf; + BenchStatus& stats; + std::atomic& stopSignal; +}; + +} + diff --git a/tests/bench/db_adapter.h b/tests/bench/db_adapter.h new file mode 100644 index 0000000..474801d --- /dev/null +++ b/tests/bench/db_adapter.h @@ -0,0 +1,117 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "bench_config.h" +#include "json.hpp" + +#include +#include + +#include "string.h" + +namespace jungle_bench { + +class DbAdapter { +public: + struct Buffer { + Buffer() : data(nullptr), size(0) {} + Buffer(void* ptr, size_t len) : data((uint8_t*)ptr), size(len) {} + Buffer(const std::string& s) : data((uint8_t*)s.data()), size(s.size()) {} + void alloc(size_t len) { + data = (uint8_t*)malloc(len); + } + void free() { + ::free(data); + } + uint8_t* data; + size_t size; + }; + + struct KvElem { + KvElem() {} + Buffer key; + Buffer value; + }; + + struct BatchOptions { + BatchOptions() : sync(false) {} + + // If true, given batch should be durable on disk + // before returning `setBatch()` API. + bool sync; + }; + + DbAdapter() {} + + virtual ~DbAdapter() {} + + // Return a string that can identify the type and version of DB. + virtual std::string getName() = 0; + + // Open a DB instance. 
+ virtual int open(const std::string& db_file, + const BenchConfig& bench_config, + json::JSON db_config) = 0; + + // Close a DB instance. + virtual int close() = 0; + + // Shutdown process-wide engine stuffs (if necessary). + virtual int shutdown() = 0; + + // Start bulk load mode (if necessary). + virtual int startInitialLoad() = 0; + + // Finish bulk load mode (if necessary). + virtual int endInitialLoad() = 0; + + // Single set. + virtual int set(const KvElem& elem) = 0; + + // Batch set. + virtual int setBatch(const std::list& batch, + const BatchOptions& opt) = 0; + + // Point query. + // Underlying database should allocate a new memory + // for returned `value_out`, and benchmark program + // is responsible for freeing it. + virtual int get(const Buffer& key, + Buffer& value_out) = 0; + + virtual std::string get(const std::string& key) { + Buffer value_out; + int ret = get(Buffer(key), value_out); + if (ret != 0) return std::string(); + + std::string ret_str((const char*)value_out.data, value_out.size); + value_out.free(); + return ret_str; + } + + // Range query returns KV pairs within [start, end]. + // Underlying database should allocate new memory blobs + // for returned `kvs_out`, and benchmark program + // is responsible for freeing them. + virtual int getRange(const Buffer& start, + const Buffer& end, + std::list& kvs_out) = 0; +}; + +} // namespace jungle_bench; + diff --git a/tests/bench/db_adapter_jungle.cc b/tests/bench/db_adapter_jungle.cc new file mode 100644 index 0000000..c045fa6 --- /dev/null +++ b/tests/bench/db_adapter_jungle.cc @@ -0,0 +1,219 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "db_adapter_jungle.h" + +#include "json_common.h" +#include "libjungle/jungle.h" + +namespace jungle_bench { + +jungle::SizedBuf conv_buf(const DbAdapter::Buffer& buf) { + return jungle::SizedBuf(buf.size, buf.data); +} + +DbAdapter::Buffer conv_buf(const jungle::SizedBuf& buf) { + DbAdapter::Buffer b; + b.data = buf.data; + b.size = buf.size; + return b; +} + +jungle::KV conv_kv(const DbAdapter::KvElem& elem) { + return jungle::KV( conv_buf(elem.key), conv_buf(elem.value) ); +} + + +int JungleAdapter::open(const std::string& db_file, + const BenchConfig& bench_config, + json::JSON db_config) +{ + dbPath = db_file; + configObj = db_config; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + //g_config.flusherSleepDuration_ms = 5000; + uint64_t wal_size_mb = 256; + _jint(wal_size_mb, configObj, "wal_size_mb"); + g_config.flusherMinRecordsToTrigger = + wal_size_mb * 1024 * 1024 / bench_config.valueLen.median; + + uint64_t cache_size_mb = 4096; + _jint(cache_size_mb, configObj, "cache_size_mb"); + g_config.fdbCacheSize = (uint64_t)cache_size_mb*1024*1024; + + _jint(g_config.numCompactorThreads, configObj, "num_compactor_threads"); + if (!g_config.numCompactorThreads) g_config.numCompactorThreads = 1; + + g_config.compactorSleepDuration_ms = 1000; // 1 second + + _jint(g_config.numTableWriters, configObj, "num_table_writers"); + if (!g_config.numTableWriters) g_config.numTableWriters = 8; + + g_config.flusherAutoSync = false; + + //g_config.itcOpt.timeWindow_sec = 10; 
+ g_config.itcOpt.startHour = 0; + g_config.itcOpt.endHour = 0; + + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.compactionFactor = 300; + _jint(config.compactionFactor, configObj, "compaction_factor"); + + config.minFileSizeToCompact = 16*1024*1024; + + _jint(config.blockReuseFactor, configObj, "block_reuse_factor"); + config.minBlockReuseCycleToCompact = 0; + config.maxBlockReuseCycle = 100; + + config.nextLevelExtension = true; + + uint64_t table_size_mb = 1024; + _jint(table_size_mb, configObj, "l0_table_size_mb"); + config.maxL0TableSize = table_size_mb * 1024 * 1024; + + table_size_mb = 1024; + _jint(table_size_mb, configObj, "l1_table_size_mb"); + config.maxL1TableSize = table_size_mb * 1024 * 1024; + + /* + config.lookupBoosterLimit_mb = { (uint32_t)cache_size_mb / 12, + (uint32_t)cache_size_mb * 2 / 12 };*/ + config.lookupBoosterLimit_mb = {100, 200}; + + uint64_t max_l1_size_mb = 10240; + _jint(max_l1_size_mb, configObj, "l1_size_mb"); + config.maxL1Size = max_l1_size_mb * 1024 * 1024; + + config.bloomFilterBitsPerUnit = 10; + _jfloat(config.bloomFilterBitsPerUnit, configObj, "bloom_filter_bits"); + + //config.useBloomFilter = false; + + jungle::Status s = jungle::DB::open(&myDb, db_file, config); + if (!s) return s.getValue(); + + // Flush all logs on initial open. + myDb->sync(false); + myDb->flushLogs( jungle::FlushOptions() ); + + return 0; +} + +int JungleAdapter::close() { + // Flush all logs before close. 
+ myDb->sync(false); + myDb->flushLogs( jungle::FlushOptions() ); + + jungle::Status s = jungle::DB::close(myDb); + if (!s) return s.getValue(); + + return 0; +} + +int JungleAdapter::shutdown() { + jungle::shutdown(); + return 0; +} + +int JungleAdapter::startInitialLoad() { + return 0; +} + +int JungleAdapter::endInitialLoad() { + myDb->sync(false); + myDb->flushLogs( jungle::FlushOptions() ); + return 0; +} + +int JungleAdapter::set(const KvElem& elem) { + jungle::Status s; + s = myDb->set( conv_kv(elem) ); + if (!s) return (int)s; + + return 0; +} + +int JungleAdapter::setBatch(const std::list& batch, + const BatchOptions& opt) +{ + jungle::Status s; + for (auto& entry: batch) { + const KvElem& elem = entry; + s = myDb->set( conv_kv(elem) ); + if (!s) return s.getValue(); + } + + if (opt.sync) { + s = myDb->sync(opt.sync); + if (!s) return s.getValue(); + } + + return 0; +} + +int JungleAdapter::get(const Buffer& key, + Buffer& value_out) +{ + jungle::Status s; + jungle::SizedBuf local_value_out; + s = myDb->get( conv_buf(key), local_value_out ); + if (!s) return (int)s; + + value_out = conv_buf(local_value_out); + return 0; +} + +int JungleAdapter::getRange(const Buffer& start, + const Buffer& end, + std::list& kvs_out) +{ + jungle::Status s; + jungle::Iterator itr; + s = itr.init(myDb, conv_buf(start), conv_buf(end)); + if (!s) return (int)s; + + do { + jungle::Record rec_out; + jungle::Record::Holder h_rec_out(rec_out); + s = itr.get(rec_out); + if (!s) break; + if (rec_out.kv.key > conv_buf(end)) break; + + jungle::SizedBuf key_out; + jungle::SizedBuf val_out; + rec_out.kv.key.moveTo(key_out); + rec_out.kv.value.moveTo(val_out); + + KvElem elem_to_add; + elem_to_add.key = conv_buf(key_out); + elem_to_add.value = conv_buf(val_out); + kvs_out.push_back(elem_to_add); + + } while (itr.next()); + + s = itr.close(); + if (!s) return (int)s; + + return 0; +} + +} // namespace jungle_bench; + diff --git a/tests/bench/db_adapter_jungle.h 
b/tests/bench/db_adapter_jungle.h new file mode 100644 index 0000000..1f17693 --- /dev/null +++ b/tests/bench/db_adapter_jungle.h @@ -0,0 +1,67 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "db_adapter.h" + +#include + +namespace jungle { + class DB; +}; + +namespace jungle_bench { + +class JungleAdapter : public DbAdapter { +public: + JungleAdapter() : myDb(nullptr) {} + + ~JungleAdapter() {} + + std::string getName() { return "jungle"; } + + int open(const std::string& db_file, + const BenchConfig& bench_config, + json::JSON db_confi); + + int close(); + + int shutdown(); + + int startInitialLoad(); + + int endInitialLoad(); + + int set(const KvElem& elem); + + int setBatch(const std::list& batch, + const BatchOptions& opt); + + int get(const Buffer& key, + Buffer& value_out); + + int getRange(const Buffer& start, + const Buffer& end, + std::list& kvs_out); + + std::string dbPath; + json::JSON configObj; + jungle::DB* myDb; +}; + +} // namespace jungle_bench; + diff --git a/tests/bench/dist_def.h b/tests/bench/dist_def.h new file mode 100644 index 0000000..d406985 --- /dev/null +++ b/tests/bench/dist_def.h @@ -0,0 +1,250 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "murmurhash3.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct DistDef { + enum Type { + RANDOM = 0x0, + NORMAL = 0x1, + ZIPF = 0x2, + }; + + DistDef( Type _type = RANDOM, + uint64_t _median = 50, + uint64_t _sigma = 50, + double _alpha = 1.0, + uint64_t _z_elems = 10 ) + : type(_type) + , median(_median) + , sigma(_sigma) + , zAlpha(_alpha) + , zNumElems(_z_elems) + , zPrimeNumber(7) + { + if (type == ZIPF) { + // Prepare the probabilities. + double c = 0; + for (uint64_t ii = 1; ii <= zNumElems; ++ii) { + c = c + ( 1.0 / pow( (double)ii, zAlpha) ); + } + c = 1.0 / c; + + // Reverse calculate the cumulative probability, and keep it. + double x = 0.0; + for (uint64_t ii = 1; ii <= zNumElems; ++ii) { + x += c / pow((double)ii, zAlpha); + + // The key of last `probs` entry MUST BE 1.0. + if (ii == zNumElems) x = 1.0; + zProbs[x] = ii; + } + + // Find proper prime number according to `sigma * 2`. + static std::vector + prime_numbers = {53, 503, 5003, 50021, 500009, 5000011}; + for (uint64_t p: prime_numbers) { + if (p < sigma * 2) zPrimeNumber = p; + } + } + } + + ~DistDef() {} + + /** + * Return a random number according to configured distribution. + * + * @return Random number. 
+ */ + uint64_t get() const { + return get( std::rand() ); + } + + /** + * Return the index of Zipfian element according to + * configured parameter (i.e., alpha). + * + * @return Zipfian element index number: [0, zNumElems). + */ + uint64_t getZipfElem() const { + if (type != ZIPF) return 0; + return getZipfElem( std::rand() ); + } + + /** + * Calculate random number from given index deterministically. + * If the same index is given, it returns the same random number. + * + * @param index Seed number to generate random number. + * @return Random number. + */ + uint64_t get(uint64_t index) const { + static uint64_t MAX64 = std::numeric_limits::max(); + static double PI = 3.141592654; + + // Generate two u64 random numbers. + uint64_t rr[2]; + MurmurHash3_x64_128(&index, sizeof(index), 0, rr); + + switch (type) { + case NORMAL: { + double r1 = 0, r2 = 0; + r1 = -log( 1 - ( (double)rr[0] / MAX64 ) ); + r2 = 2 * PI * ( (double)rr[1] / MAX64 ); + r1 = sqrt( 2 * r1 ); + + int64_t value = (int64_t)sigma * r1 * cos(r2) + median; + if (value < 0) value = 0; + return (uint64_t)value; + } + + case ZIPF: { + // 1) First get the Zipf group index. + double z = (double)rr[0] / MAX64; + auto iter = zProbs.lower_bound(z); + assert(iter != zProbs.end()); + uint64_t group_idx = iter->second - 1; + assert(group_idx < zNumElems); + + // 2) Randomly get a random number within range corresponding to + // the Zipf group. + uint64_t range_per_group = sigma * 2 / zNumElems; + uint64_t next_index = (range_per_group) + ? rr[1] % range_per_group : 0; + next_index += range_per_group * group_idx; + + // 3) Using `next_index`, do linear shift and find + // its corresponding (pseudo random) number, + // by linear congruential generation. + uint64_t local_random = (next_index * zPrimeNumber + 7) % + (sigma * 2); + return median + local_random - sigma; + } + + default: + // Uniform random. 
+ uint64_t local_random = 0; + if (sigma) local_random = rr[0] % (sigma * 2); + return median + local_random - sigma; + } + return 0; + } + + /** + * Calculate the index of Zipfian element corresponding to the given + * index. If the same index is given, it returns the same element index. + * + * @param index Seed number to get Zipfian element index. + * @return Zipfian element index number: [0, zNumElems). + */ + uint64_t getZipfElem(uint64_t index) const { + if (type != ZIPF) return 0; + + // Generate two u64 random numbers. + uint64_t rr[2]; + MurmurHash3_x64_128(&index, sizeof(index), 0, rr); + + double z = (double)rr[0] / std::numeric_limits::max(); + auto iter = zProbs.lower_bound(z); + assert(iter != zProbs.end()); + return iter->second - 1; + } + + /** + * Dump current distribution settings to string. + * + * @return Information string. + */ + std::string toString() { + std::stringstream ss; + + if (type == RANDOM || type == NORMAL) { + if (!sigma) { + // Fixed length + ss << median << " (fixed)"; + return ss.str(); + } + } + + switch (type) { + case RANDOM: + ss << "R[" << median - sigma << ", " << median + sigma << ")"; + break; + + case NORMAL: + ss << "N(" << median << ", " << sigma << ")"; + break; + + case ZIPF: + ss << "Z(" << zAlpha << ", " << zNumElems << ")"; + break; + + default: + ss << "unknown"; + break; + }; + return ss.str(); + } + + // * Uniform random: [median - sigma, median + sigma) + // + // * Normal: Norm(median, sigma) + // - 68%: [median - sigma, median + sigma] + // - 97%: [median - 2 * sigma, median + 2 * sigma] + // - 99%: [median - 3 * sigma, median + 3 * sigma] + // + // * Zipf: According to `zAlpha` value. + // If `zNumElems` = 10 + // 0 1 2 3 (Zipfian element index) + // - 0.0: 10%, 10%, 10%, 10%, ... (even) + // - 1.0: 34%, 17%, 11%, 8.5%, ... + // - 2.0: 65%, 16%, 7%, 4%, ... + // Within the same element, the probability will be uniform random. + + Type type; + + // Expected median. + uint64_t median; + + // Expected sigma. 
+ uint64_t sigma; + + // --- For Zipfian distribution --- + // Zipfian alpha parameter. + double zAlpha; + + // Number of probability elements for Zipfian distribution. + uint64_t zNumElems; + + // Cumulative probability map. + std::map zProbs; + + // Prime number for linear shift. + uint64_t zPrimeNumber; +}; + diff --git a/tests/bench/dist_def_test.cc b/tests/bench/dist_def_test.cc new file mode 100644 index 0000000..be4337c --- /dev/null +++ b/tests/bench/dist_def_test.cc @@ -0,0 +1,367 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "test_common.h" + +#include "dist_def.h" +#include "json_common.h" +#include "json_to_dist_def.h" + +#include +#include +#include +#include + +using namespace jungle_bench; + +namespace dist_def_test { + +void prepare_sample_set(DistDef::Type type, + int size, + int zipf_n, + int mean, + int sigma, + double alpha, + std::map& map) +{ + json::JSON params; + + switch (type) { + case DistDef::Type::NORMAL: + params["type"] = "normal"; + break; + case DistDef::Type::RANDOM: + default: + params["type"] = "random"; + break; + case DistDef::Type::ZIPF: + params["type"] = "zipf"; + break; + } + + params["alpha"] = alpha; + params["n"] = zipf_n; + params["median"] = mean; + params["sigma"] = sigma; + + DistDef dist = load_dist_def_from_json(params); + TestSuite::_msg(dist.toString().c_str()); + for (int i = 0; i < size; ++i) { + uint64_t val = + (type != DistDef::Type::ZIPF) + ? dist.get() : dist.getZipfElem(); + if (map.find(val) == map.end()) { + map[val] = 0; + } + map[val]++; + } +} + +double get_mean(const std::map& map) { + uint64_t total_sum = 0; + uint64_t total_size = 0; + for (auto iter: map) { + total_sum += iter.first * iter.second; + total_size += iter.second; + } + return (double) (total_sum / total_size); +} + +double get_sigma(const std::map& map) { + double mean = get_mean(map); + double sigma = 0.0; + uint64_t total_sum = 0; + for (auto iter: map) { + uint64_t i = iter.second; + for (uint64_t j = 0; j < i; ++j) { + sigma += pow((mean - iter.first), 2); + } + total_sum += iter.second; + } + return sqrt(sigma / total_sum); +} + +uint64_t get_min(const std::map& map) { + auto itr = map.begin(); + if (itr == map.end()) return 0; + return itr->first; +} + +uint64_t get_max(const std::map& map) { + auto itr = map.rbegin(); + if (itr == map.rend()) return 0; + return itr->first; +} + +double get_cdf(const std::map& map, uint64_t x) { + uint64_t cnt = 0; + uint64_t total_sum = 
0; + for (auto iter: map) { + if (iter.first <= x) { + cnt += iter.second; + } + total_sum += iter.second; + } + return (cnt / (double)total_sum); +} + +int random_dist_test(const std::vector& sizes, + uint64_t median, + uint64_t sigma) +{ + std::map mymap; + TestSuite::_msg("Random Distribution: \n"); + for (int size: sizes) { + mymap.clear(); + TestSuite::_msg("Sample Set Size = %d\n", size); + prepare_sample_set(DistDef::Type::RANDOM, size, 0, median, sigma, 0, mymap); + TestSuite::_msg("\nMean: %f", get_mean(mymap)); + TestSuite::_msg("\nSD: %f", get_sigma(mymap)); + TestSuite::_msg("\nMin: %zu", get_min(mymap)); + TestSuite::_msg("\nMax: %zu", get_max(mymap)); + for (uint64_t ii=1; ii<=10; ++ii) { + // Random: 100% value should be in + // [median - sigma, median + sigma). + uint64_t val = (median - sigma) + (sigma * 2) * ii / 10; + TestSuite::_msg( "\nCDF <= %zu: %f", + val, get_cdf(mymap, val) ); + } + TestSuite::_msg("\n\n"); + } + return 0; +} + +int normal_dist_test(const std::vector& sizes, + uint64_t median, + uint64_t sigma) +{ + std::map mymap; + TestSuite::_msg("Normal Distribution: \n"); + for (int size: sizes) { + mymap.clear(); + TestSuite::_msg("Sample Set Size = %d\n", size); + prepare_sample_set(DistDef::Type::NORMAL, size, 0, median, sigma, 0, mymap); + TestSuite::_msg("\nMean: %f", get_mean(mymap)); + TestSuite::_msg("\nSD: %f", get_sigma(mymap)); + TestSuite::_msg("\nMin: %zu", get_min(mymap)); + TestSuite::_msg("\nMax: %zu", get_max(mymap)); + for (uint64_t ii=1; ii<=12; ++ii) { + // Normal: 99.7% value should be in + // [median - 3*sigma, median + 3*sigma]. 
+ uint64_t val = (median - 3 * sigma) + (sigma * 6) * ii / 12; + TestSuite::_msg( "\nCDF <= %zu: %f", + val, get_cdf(mymap, val) ); + } + TestSuite::_msg("\n\n"); + } + return 0; +} + +int zipf_dist_test(const std::vector& sizes, + uint64_t zipf_n, + const std::vector& alphas) +{ + std::map mymap; + TestSuite::_msg("Zipfian Distribution: \n"); + for (int size: sizes) { + TestSuite::_msg("Sample Set Size = %d\n", size); + for (double alpha: alphas) { + mymap.clear(); + prepare_sample_set(DistDef::Type::ZIPF, size, zipf_n, 0, 0, alpha, mymap); + TestSuite::_msg("\nMean: %f", get_mean(mymap)); + TestSuite::_msg("\nSD: %f", get_sigma(mymap)); + TestSuite::_msg("\nMin: %zu", get_min(mymap)); + TestSuite::_msg("\nMax: %zu", get_max(mymap)); + for (uint64_t ii=1; ii<=10; ++ii) { + uint64_t val = zipf_n * ii / 10; + TestSuite::_msg( "\nCDF <= %zu: %f", + val, get_cdf(mymap, val) ); + } + TestSuite::_msg("\n\n"); + } + TestSuite::_msg("\n\n"); + } + return 0; +} + +int deterministic_random_test(size_t NUM) { + std::list random_numbers; + DistDef dd(DistDef::RANDOM, NUM, NUM); + + // 1st run: collect random numbers. + for (size_t ii=0; ii random_numbers; + DistDef dd(DistDef::NORMAL, NUM, NUM/10); + + // 1st run: collect random numbers. + for (size_t ii=0; ii random_numbers; + DistDef dd(DistDef::ZIPF, NUM, NUM, 1.0, 10); + + // 1st run: collect random numbers. + for (size_t ii=0; ii count; + auto entry = random_numbers.begin(); + for (size_t ii=0; ii random_numbers; + size_t RANGE = 1000; + DistDef dd(DistDef::ZIPF, RANGE / 2, RANGE / 2, 1.0, 10); + + // 1st run: collect random numbers. 
+ for (size_t ii=0; ii freq_count; + auto entry = random_numbers.begin(); + for (size_t ii=0; ii elems; + for (auto& entry: freq_count) { + elems.push_back( Elem(entry.second, entry.first) ); + } + std::sort( elems.begin(), elems.end(), + [](const Elem& a, const Elem& b) -> bool { + return (a.count > b.count); + } ); + + size_t sum = 0; + size_t id = 0; + double last_print = 0.0; + bool print_all = false; + for (const Elem& entry: elems) { + sum += entry.count; + id++; + if ( (id * 100.0 / elems.size()) >= last_print + 10 || + print_all ) { + TestSuite::_msg("%zu\t%zu\t(%.1f)\t%zu\t%zu\n", + sum, id, (id * 100.0 / elems.size()), + entry.count, entry.number); + last_print += 10; + } + } + + return 0; +} + +} + +using namespace dist_def_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.options.printTestMessage = true; + + std::vector sizes { 1000, 10000, 100000 }; + int zipf_n = 10000; + std::vector alphas { 0.1, 0.27, 0.99 }; + uint64_t median = 5000, sigma = 5000; + + ts.doTest( "uniform random test", + random_dist_test, + sizes, median, sigma ); + + ts.doTest( "normal distribution test", + normal_dist_test, + sizes, median, sigma/4 ); + + ts.doTest( "zipfian distribution test", + zipf_dist_test, + sizes, zipf_n, alphas ); + + ts.doTest( "deterministic uniform random test", + deterministic_random_test, + 100000 ); + + ts.doTest( "deterministic normal distribution test", + deterministic_normal_dist_test, + 100000 ); + + ts.doTest( "deterministic zipf distribution test", + deterministic_zipf_dist_test, + 100000 ); + + ts.doTest( "deterministic zipf distribution test2", + deterministic_zipf_dist_test2, + 100000 ); + + return 0; +} diff --git a/tests/bench/example_config.json b/tests/bench/example_config.json new file mode 100644 index 0000000..5b7b246 --- /dev/null +++ b/tests/bench/example_config.json @@ -0,0 +1,55 @@ +{ + "db_path": "./bench_db", + "log_path": "./", + "duration_sec": 30, + "duration_ops": 0, + "warming_up_sec": 3, + 
"cooling_down_sec": 5, + "num_kv_pairs": 200000, + "initial_load": true, + "initial_load_rate": 50000, + "initial_load_order": "random", + "key": { + "type": "random", + "median": 8, + "sigma": 4 + }, + "value": { + "type": "normal", + "median": 1024, + "sigma": 16 + }, + "workers": [ + { + "type": "writer", + "rate": 2000 + }, + { + "type": "point_reader", + "rate": 2000 + }, + { + "type": "range_reader", + "rate": 10, + "batch": { + "type": "normal", + "median": 100, + "sigma": 20 + } + } + ], + "db_configs": { + "cache_size_mb": 1024, + "wal_size_mb": 64, + "compaction_factor": 240, + "max_tiering_limit": 10, + "merge_mode": "appending", + "block_reuse_factor": 0, + "l0_table_size_mb": 1024, + "l1_table_size_mb": 2560, + "l1_size_mb": 122880, + "bloom_filter_bits": 10.0, + "num_table_writers": 8, + "num_compactor_threads": 2 + } +} \ No newline at end of file diff --git a/tests/bench/json.hpp b/tests/bench/json.hpp new file mode 100644 index 0000000..3e2c769 --- /dev/null +++ b/tests/bench/json.hpp @@ -0,0 +1,676 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright: +See URL: https://github.com/nbsdx/SimpleJSON + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// LCOV_EXCL_START + +namespace json { + +using std::map; +using std::deque; +using std::string; +using std::enable_if; +using std::initializer_list; +using std::is_same; +using std::is_convertible; +using std::is_integral; +using std::is_floating_point; + +namespace { + string json_escape( const string &str ) { + string output; + for( unsigned i = 0; i < str.length(); ++i ) + switch( str[i] ) { + case '\"': output += "\\\""; break; + case '\\': output += "\\\\"; break; + case '\b': output += "\\b"; break; + case '\f': output += "\\f"; break; + case '\n': output += "\\n"; break; + case '\r': output += "\\r"; break; + case '\t': output += "\\t"; break; + default : output += str[i]; break; + } + return std::move( output ); + } +} + +class JSON +{ + union BackingData { + BackingData( double d ) : Float( d ){} + BackingData( long l ) : Int( l ){} + BackingData( bool b ) : Bool( b ){} + BackingData( string s ) : String( new string( s ) ){} + BackingData() : Int( 0 ){} + + deque *List; + map *Map; + string *String; + double Float; + long Int; + bool Bool; + } Internal; + + public: + enum class Class { + Null, + Object, + Array, + String, + Floating, + Integral, + Boolean + }; + + template + class JSONWrapper { + Container *object; + + public: + JSONWrapper( Container *val ) : object( val ) {} + JSONWrapper( std::nullptr_t ) : object( nullptr ) {} + + typename Container::iterator begin() { return object ? object->begin() : typename Container::iterator(); } + typename Container::iterator end() { return object ? object->end() : typename Container::iterator(); } + typename Container::const_iterator begin() const { return object ? object->begin() : typename Container::iterator(); } + typename Container::const_iterator end() const { return object ? 
object->end() : typename Container::iterator(); } + }; + + template + class JSONConstWrapper { + const Container *object; + + public: + JSONConstWrapper( const Container *val ) : object( val ) {} + JSONConstWrapper( std::nullptr_t ) : object( nullptr ) {} + + typename Container::const_iterator begin() const { return object ? object->begin() : typename Container::const_iterator(); } + typename Container::const_iterator end() const { return object ? object->end() : typename Container::const_iterator(); } + }; + + JSON() : Internal(), Type( Class::Null ){} + + JSON( initializer_list list ) + : JSON() + { + SetType( Class::Object ); + for( auto i = list.begin(), e = list.end(); i != e; ++i, ++i ) + operator[]( i->ToString() ) = *std::next( i ); + } + + JSON( JSON&& other ) + : Internal( other.Internal ) + , Type( other.Type ) + { other.Type = Class::Null; other.Internal.Map = nullptr; } + + JSON& operator=( JSON&& other ) { + ClearInternal(); + Internal = other.Internal; + Type = other.Type; + other.Internal.Map = nullptr; + other.Type = Class::Null; + return *this; + } + + JSON( const JSON &other ) { + switch( other.Type ) { + case Class::Object: + Internal.Map = + new map( other.Internal.Map->begin(), + other.Internal.Map->end() ); + break; + case Class::Array: + Internal.List = + new deque( other.Internal.List->begin(), + other.Internal.List->end() ); + break; + case Class::String: + Internal.String = + new string( *other.Internal.String ); + break; + default: + Internal = other.Internal; + } + Type = other.Type; + } + + JSON& operator=( const JSON &other ) { + // Check self assigning. 
+ if (&other == this) return *this; + + ClearInternal(); + switch( other.Type ) { + case Class::Object: + Internal.Map = + new map( other.Internal.Map->begin(), + other.Internal.Map->end() ); + break; + case Class::Array: + Internal.List = + new deque( other.Internal.List->begin(), + other.Internal.List->end() ); + break; + case Class::String: + Internal.String = + new string( *other.Internal.String ); + break; + default: + Internal = other.Internal; + } + Type = other.Type; + return *this; + } + + ~JSON() { + switch( Type ) { + case Class::Array: + delete Internal.List; + break; + case Class::Object: + delete Internal.Map; + break; + case Class::String: + delete Internal.String; + break; + default:; + } + } + + template + JSON( T b, typename enable_if::value>::type* = 0 ) : Internal( b ), Type( Class::Boolean ){} + + template + JSON( T i, typename enable_if::value && !is_same::value>::type* = 0 ) : Internal( (long)i ), Type( Class::Integral ){} + + template + JSON( T f, typename enable_if::value>::type* = 0 ) : Internal( (double)f ), Type( Class::Floating ){} + + template + JSON( T s, typename enable_if::value>::type* = 0 ) : Internal( string( s ) ), Type( Class::String ){} + + JSON( std::nullptr_t ) : Internal(), Type( Class::Null ){} + + static JSON Make( Class type ) { + JSON ret; ret.SetType( type ); + return ret; + } + + inline static JSON Load( const string & ); + + template + void append( T arg ) { + SetType( Class::Array ); Internal.List->emplace_back( arg ); + } + + template + void append( T arg, U... args ) { + append( arg ); append( args... 
); + } + + template + typename enable_if::value, JSON&>::type operator=( T b ) { + SetType( Class::Boolean ); Internal.Bool = b; return *this; + } + + template + typename enable_if::value && !is_same::value, JSON&>::type operator=( T i ) { + SetType( Class::Integral ); Internal.Int = i; return *this; + } + + template + typename enable_if::value, JSON&>::type operator=( T f ) { + SetType( Class::Floating ); Internal.Float = f; return *this; + } + + template + typename enable_if::value, JSON&>::type operator=( T s ) { + SetType( Class::String ); *Internal.String = string( s ); return *this; + } + + JSON& operator[]( const string &key ) { + SetType( Class::Object ); return Internal.Map->operator[]( key ); + } + + JSON& operator[]( unsigned index ) { + SetType( Class::Array ); + if( index >= Internal.List->size() ) Internal.List->resize( index + 1 ); + return Internal.List->operator[]( index ); + } + + JSON &at( const string &key ) { + return operator[]( key ); + } + + const JSON &at( const string &key ) const { + return Internal.Map->at( key ); + } + + JSON &at( unsigned index ) { + return operator[]( index ); + } + + const JSON &at( unsigned index ) const { + return Internal.List->at( index ); + } + + int length() const { + if( Type == Class::Array ) + return Internal.List->size(); + else + return -1; + } + + bool hasKey( const string &key ) const { + if( Type == Class::Object ) + return Internal.Map->find( key ) != Internal.Map->end(); + return false; + } + + int size() const { + if( Type == Class::Object ) + return Internal.Map->size(); + else if( Type == Class::Array ) + return Internal.List->size(); + else + return -1; + } + + Class JSONType() const { return Type; } + + /// Functions for getting primitives from the JSON object. 
+ bool IsNull() const { return Type == Class::Null; } + bool NotNull() const { return !IsNull(); } + + string ToString() const { bool b; return std::move( ToString( b ) ); } + string ToString( bool &ok ) const { + ok = (Type == Class::String); + return ok ? std::move( json_escape( *Internal.String ) ): string(""); + } + + double ToFloat() const { bool b; return ToFloat( b ); } + double ToFloat( bool &ok ) const { + ok = (Type == Class::Floating); + return ok ? Internal.Float : 0.0; + } + + long ToInt() const { bool b; return ToInt( b ); } + long ToInt( bool &ok ) const { + ok = (Type == Class::Integral); + return ok ? Internal.Int : 0; + } + + bool ToBool() const { bool b; return ToBool( b ); } + bool ToBool( bool &ok ) const { + ok = (Type == Class::Boolean); + return ok ? Internal.Bool : false; + } + + JSONWrapper> ObjectRange() { + if( Type == Class::Object ) + return JSONWrapper>( Internal.Map ); + return JSONWrapper>( nullptr ); + } + + JSONWrapper> ArrayRange() { + if( Type == Class::Array ) + return JSONWrapper>( Internal.List ); + return JSONWrapper>( nullptr ); + } + + JSONConstWrapper> ObjectRange() const { + if( Type == Class::Object ) + return JSONConstWrapper>( Internal.Map ); + return JSONConstWrapper>( nullptr ); + } + + + JSONConstWrapper> ArrayRange() const { + if( Type == Class::Array ) + return JSONConstWrapper>( Internal.List ); + return JSONConstWrapper>( nullptr ); + } + + string dump( int depth = 1, string tab = " ") const { + string pad = ""; + for( int i = 0; i < depth; ++i, pad += tab ); + + switch( Type ) { + case Class::Null: + return "null"; + case Class::Object: { + string s = "{\n"; + bool skip = true; + for( auto &p : *Internal.Map ) { + if( !skip ) s += ",\n"; + s += ( pad + "\"" + p.first + "\" : " + p.second.dump( depth + 1, tab ) ); + skip = false; + } + s += ( "\n" + pad.erase( 0, 2 ) + "}" ) ; + return s; + } + case Class::Array: { + string s = "["; + bool skip = true; + for( auto &p : *Internal.List ) { + if( !skip ) s += ", 
"; + s += p.dump( depth + 1, tab ); + skip = false; + } + s += "]"; + return s; + } + case Class::String: + return "\"" + json_escape( *Internal.String ) + "\""; + case Class::Floating: + return std::to_string( Internal.Float ); + case Class::Integral: + return std::to_string( Internal.Int ); + case Class::Boolean: + return Internal.Bool ? "true" : "false"; + default: + return ""; + } + return ""; + } + + friend std::ostream& operator<<( std::ostream&, const JSON & ); + + private: + void SetType( Class type ) { + if( type == Type ) + return; + + ClearInternal(); + + switch( type ) { + case Class::Null: Internal.Map = nullptr; break; + case Class::Object: Internal.Map = new map(); break; + case Class::Array: Internal.List = new deque(); break; + case Class::String: Internal.String = new string(); break; + case Class::Floating: Internal.Float = 0.0; break; + case Class::Integral: Internal.Int = 0; break; + case Class::Boolean: Internal.Bool = false; break; + } + + Type = type; + } + + private: + /* beware: only call if YOU know that Internal is allocated. No checks performed here. + This function should be called in a constructed JSON just before you are going to + overwrite Internal... + */ + void ClearInternal() { + switch( Type ) { + case Class::Object: delete Internal.Map; break; + case Class::Array: delete Internal.List; break; + case Class::String: delete Internal.String; break; + default:; + } + } + + private: + + Class Type = Class::Null; +}; + +inline JSON Array() { + return std::move( JSON::Make( JSON::Class::Array ) ); +} + +template +inline JSON Array( T... args ) { + JSON arr = JSON::Make( JSON::Class::Array ); + arr.append( args... 
); + return std::move( arr ); +} + +inline JSON Object() { + return std::move( JSON::Make( JSON::Class::Object ) ); +} + +inline std::ostream& operator<<( std::ostream &os, const JSON &json ) { + os << json.dump(); + return os; +} + +namespace { + JSON parse_next( const string &, size_t & ); + + void consume_ws( const string &str, size_t &offset ) { + while( isspace( str[offset] ) ) ++offset; + } + + JSON parse_object( const string &str, size_t &offset ) { + JSON Object = JSON::Make( JSON::Class::Object ); + + ++offset; + consume_ws( str, offset ); + if( str[offset] == '}' ) { + ++offset; return std::move( Object ); + } + + while( true ) { + JSON Key = parse_next( str, offset ); + consume_ws( str, offset ); + if( str[offset] != ':' ) { + std::cerr << "Error: Object: Expected colon, found '" << str[offset] << "'\n"; + break; + } + consume_ws( str, ++offset ); + JSON Value = parse_next( str, offset ); + Object[Key.ToString()] = Value; + + consume_ws( str, offset ); + if( str[offset] == ',' ) { + ++offset; continue; + } + else if( str[offset] == '}' ) { + ++offset; break; + } + else { + std::cerr << "ERROR: Object: Expected comma, found '" << str[offset] << "'\n"; + break; + } + } + + return std::move( Object ); + } + + JSON parse_array( const string &str, size_t &offset ) { + JSON Array = JSON::Make( JSON::Class::Array ); + unsigned index = 0; + + ++offset; + consume_ws( str, offset ); + if( str[offset] == ']' ) { + ++offset; return std::move( Array ); + } + + while( true ) { + Array[index++] = parse_next( str, offset ); + consume_ws( str, offset ); + + if( str[offset] == ',' ) { + ++offset; continue; + } + else if( str[offset] == ']' ) { + ++offset; break; + } + else { + std::cerr << "ERROR: Array: Expected ',' or ']', found '" << str[offset] << "'\n"; + return std::move( JSON::Make( JSON::Class::Array ) ); + } + } + + return std::move( Array ); + } + + JSON parse_string( const string &str, size_t &offset ) { + JSON String; + string val; + for( char c = 
str[++offset]; c != '\"' ; c = str[++offset] ) { + if( c == '\\' ) { + switch( str[ ++offset ] ) { + case '\"': val += '\"'; break; + case '\\': val += '\\'; break; + case '/' : val += '/' ; break; + case 'b' : val += '\b'; break; + case 'f' : val += '\f'; break; + case 'n' : val += '\n'; break; + case 'r' : val += '\r'; break; + case 't' : val += '\t'; break; + case 'u' : { + val += "\\u" ; + for( unsigned i = 1; i <= 4; ++i ) { + c = str[offset+i]; + if( (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') ) + val += c; + else { + std::cerr << "ERROR: String: Expected hex character in unicode escape, found '" << c << "'\n"; + return std::move( JSON::Make( JSON::Class::String ) ); + } + } + offset += 4; + } break; + default : val += '\\'; break; + } + } + else + val += c; + } + ++offset; + String = val; + return std::move( String ); + } + + JSON parse_number( const string &str, size_t &offset ) { + JSON Number; + string val, exp_str; + char c; + bool isDouble = false; + long exp = 0; + while( true ) { + c = str[offset++]; + if( (c == '-') || (c >= '0' && c <= '9') ) + val += c; + else if( c == '.' 
) { + val += c; + isDouble = true; + } + else + break; + } + if( c == 'E' || c == 'e' ) { + c = str[ offset++ ]; + if( c == '-' ){ ++offset; exp_str += '-';} + while( true ) { + c = str[ offset++ ]; + if( c >= '0' && c <= '9' ) + exp_str += c; + else if( !isspace( c ) && c != ',' && c != ']' && c != '}' ) { + std::cerr << "ERROR: Number: Expected a number for exponent, found '" << c << "'\n"; + return std::move( JSON::Make( JSON::Class::Null ) ); + } + else + break; + } + exp = std::stol( exp_str ); + } + else if( !isspace( c ) && c != ',' && c != ']' && c != '}' ) { + std::cerr << "ERROR: Number: unexpected character '" << c << "'\n"; + return std::move( JSON::Make( JSON::Class::Null ) ); + } + --offset; + + if( isDouble ) + Number = std::stod( val ) * std::pow( 10, exp ); + else { + if( !exp_str.empty() ) + Number = std::stol( val ) * std::pow( 10, exp ); + else + Number = std::stol( val ); + } + return std::move( Number ); + } + + JSON parse_bool( const string &str, size_t &offset ) { + JSON Bool; + if( str.substr( offset, 4 ) == "true" ) + Bool = true; + else if( str.substr( offset, 5 ) == "false" ) + Bool = false; + else { + std::cerr << "ERROR: Bool: Expected 'true' or 'false', found '" << str.substr( offset, 5 ) << "'\n"; + return std::move( JSON::Make( JSON::Class::Null ) ); + } + offset += (Bool.ToBool() ? 
4 : 5); + return std::move( Bool ); + } + + JSON parse_null( const string &str, size_t &offset ) { + JSON Null; + if( str.substr( offset, 4 ) != "null" ) { + std::cerr << "ERROR: Null: Expected 'null', found '" << str.substr( offset, 4 ) << "'\n"; + return std::move( JSON::Make( JSON::Class::Null ) ); + } + offset += 4; + return std::move( Null ); + } + + JSON parse_next( const string &str, size_t &offset ) { + char value; + consume_ws( str, offset ); + value = str[offset]; + switch( value ) { + case '[' : return std::move( parse_array( str, offset ) ); + case '{' : return std::move( parse_object( str, offset ) ); + case '\"': return std::move( parse_string( str, offset ) ); + case 't' : + case 'f' : return std::move( parse_bool( str, offset ) ); + case 'n' : return std::move( parse_null( str, offset ) ); + default : if( ( value <= '9' && value >= '0' ) || value == '-' ) + return std::move( parse_number( str, offset ) ); + } + std::cerr << "ERROR: Parse: Unknown starting character '" << value << "'\n"; + return JSON(); + } +} + +JSON JSON::Load( const string &str ) { + size_t offset = 0; + return std::move( parse_next( str, offset ) ); +} + +} // End Namespace json + +// LCOV_EXCL_STOP + diff --git a/tests/bench/json_common.h b/tests/bench/json_common.h new file mode 100644 index 0000000..cc934ed --- /dev/null +++ b/tests/bench/json_common.h @@ -0,0 +1,77 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "config_test_common.h" +#include "json.hpp" + +#include +#include + +namespace jungle_bench { + +#define VOID_UNUSED void __attribute__((unused)) + +template +static VOID_UNUSED _jint(T& t, json::JSON& obj, const std::string field) { + if (obj[field].IsNull()) return; + t = obj[field].ToInt(); +} + +template +static VOID_UNUSED _jfloat(T& t, json::JSON& obj, const std::string field) { + if (obj[field].IsNull()) return; + t = obj[field].ToFloat(); +} + +template +static VOID_UNUSED _jstr(T& t, json::JSON& obj, const std::string field) { + if (obj[field].IsNull()) return; + t = obj[field].ToString(); +} + +template +static VOID_UNUSED _jbool(T& t, json::JSON& obj, const std::string field) { + if (obj[field].IsNull()) return; + t = obj[field].ToBool(); +} + +// Write given json object to the file. +static VOID_UNUSED write_json_object(const std::string& filename, + const json::JSON& obj) +{ + std::ofstream fs; + fs.open(filename.c_str()); + fs << obj; + fs.close(); +} + +// Read json object from the file. +static VOID_UNUSED read_json_object(const std::string& filename, + json::JSON& obj_out) +{ + std::ifstream fs; + std::stringstream ss; + fs.open(filename.c_str()); + if (!fs.good()) return; + ss << fs.rdbuf(); + fs.close(); + obj_out = json::JSON::Load(ss.str()); +} + +} // namespace jungle_bench; + diff --git a/tests/bench/json_to_dist_def.h b/tests/bench/json_to_dist_def.h new file mode 100644 index 0000000..0ccc169 --- /dev/null +++ b/tests/bench/json_to_dist_def.h @@ -0,0 +1,52 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include "dist_def.h" + +static DistDef load_dist_def_from_json(json::JSON& obj) { + std::string type_str; + uint64_t median = 100; + uint64_t sigma = 50; + + if (obj["type"].NotNull()) type_str = obj["type"].ToString(); + if (obj["median"].NotNull()) median = obj["median"].ToInt(); + if (obj["sigma"].NotNull()) sigma = obj["sigma"].ToInt(); + + DistDef::Type type = DistDef::RANDOM; + if (!type_str.empty()) { + if (type_str[0] == 'n' || type_str[0] == 'N') type = DistDef::NORMAL; + else if (type_str[0] == 'z' || type_str[0] == 'Z') type = DistDef::ZIPF; + } + + switch (type) { + case DistDef::RANDOM: + case DistDef::NORMAL: + default: + return DistDef(type, median, sigma); + + case DistDef::ZIPF: + uint64_t n = 10; + double alpha = 1.0; + if (obj["n"].NotNull()) n = obj["n"].ToInt(); + if (obj["alpha"].NotNull()) alpha = obj["alpha"].ToFloat(); + return DistDef(type, median, sigma, alpha, n); + } + + return DistDef(type, median, sigma); +} + diff --git a/tests/config_test_common.h b/tests/config_test_common.h new file mode 100644 index 0000000..d333fa2 --- /dev/null +++ b/tests/config_test_common.h @@ -0,0 +1,26 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
#pragma once

#include <cstdlib>
#include <cstring>

// Enable direct I/O on the given DBConfig when the environment variable
// TEST_ENABLE_DIRECTIO is set to exactly "true".
//
// The whole body is wrapped in its own brace scope so that `direct_env`
// does not leak into (or collide with) the caller's scope — the original
// declared it unscoped, so two expansions in one function redefined it.
// A plain `{ }` block (not do/while) is used so existing call sites, which
// invoke the macro without a trailing semicolon, keep compiling.
#define TEST_CUSTOM_DB_CONFIG(config) \
    { \
        const char* direct_env = std::getenv("TEST_ENABLE_DIRECTIO"); \
        if (direct_env != nullptr && !std::strcmp(direct_env, "true")) { \
            (config).directIo = true; \
        } \
    }
+ { uint64_t seq; + s = db->getMinSeqNum(seq); + CHK_NOT(s); } + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Overwriting existing seq num should fail. + s = db->setSN(0, kv[0]); + CHK_NOT(s); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Min seqnum == 1. + { uint64_t seq; + s = db->getMinSeqNum(seq); + CHK_OK(s); + CHK_EQ(1, seq); } + + // Max seqnum == n. + { uint64_t seqnum; + s = db->getMaxSeqNum(seqnum); + CHK_EQ(n, seqnum); } + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Sync again (nothing to sync). + s = db->sync(); + CHK_OK(s); + + // Get KV pairs (after sync). + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Flush all. + jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + + // Min seqnum fail. + { uint64_t seq; + s = db->getMinSeqNum(seq); + CHK_NOT(s); } + + // Flush seqnum == n. + { uint64_t seq; + s = db->getLastFlushedSeqNum(seq); + CHK_OK(s); + CHK_EQ(n, seq); } + + // Flush again (nothing to flush). + s = db->flushLogs(f_options); + CHK_OK(s); + + // Get KV pairs (after purge). + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int many_logs_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 7; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t NUM = 100; + size_t PRIME = 17; + for (size_t ii=0; iiset( jungle::KV(key_str, val_str) ) ); + } + + for (size_t ii=0; iiget( key_str, value_out) ); + CHK_EQ( value_exp, value_out.toString() ); + } + + // Close DB. + CHK_Z( jungle::DB::close(db) ); + + // Free all resources for jungle. + jungle::shutdown(); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int overwrite_seq() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.allowOverwriteSeqNum = true; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Max seqnum == n. + uint64_t seqnum; + s = db->getMaxSeqNum(seqnum); + CHK_EQ(n, (int)seqnum); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Another KV with duplicate seq nums. + std::vector kv2(n); + + CHK_Z(_init_kv_pairs(n, kv2, "key2_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv2)); + + // Sync again. + s = db->sync(); + CHK_OK(s); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // It shouldn't be log mode. + CHK_FALSE( jungle::DB::isLogSectionMode(filename) ); + + // Reload DB. + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Write more. + std::vector kv3(n+2); + + CHK_Z(_init_kv_pairs(n+2, kv3, "key3_", "value3_")); + CHK_Z(_set_byseq_kv_pairs(n-2, n+2, 0, db, kv3)); + + // Sync again. + s = db->sync(); + CHK_OK(s); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. 
+ jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + _free_kv_pairs(n+2, kv3); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int overwrite_seq_last_record() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.allowOverwriteSeqNum = true; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Max seqnum == n. + uint64_t seqnum; + CHK_Z(db->getMaxSeqNum(seqnum)); + CHK_EQ(n, (int)seqnum); + + // Sync. + CHK_Z(db->sync()); + + // Overwrite the last one. + jungle::KV new_kv("new_key", "new_value"); + CHK_Z( db->setSN(n, new_kv) ); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n-1, 0, db, kv)); + { + jungle::KV kv_out; + jungle::KV::Holder h(kv_out); + CHK_Z( db->getSN(n, kv_out) ); + CHK_EQ(new_kv.key, kv_out.key); + CHK_EQ(new_kv.value, kv_out.value); + } + + // Sync again. + CHK_Z(db->sync()); + + // Close DB. + CHK_Z(jungle::DB::close(db)); + + // Reload DB. + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n-1, 0, db, kv)); + { + jungle::KV kv_out; + jungle::KV::Holder h(kv_out); + CHK_Z( db->getSN(n, kv_out) ); + CHK_EQ(new_kv.key, kv_out.key); + CHK_EQ(new_kv.value, kv_out.value); + } + + // Close DB. + CHK_Z(jungle::DB::close(db)); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int load_db_sync() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Reopen. + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Put more keys. + std::vector kv_second(n); + CHK_Z(_init_kv_pairs(n, kv_second, "key_v2", "value_v2")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv_second)); + + // Get KV pairs (before sync). + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_byseq_check(0, n, n, db, kv_second)); + CHK_Z(_get_bykey_check(0, n, db, kv_second)); + + // 2nd sync. + s = db->sync(); + CHK_OK(s); + + // Get KV pairs (after sync). + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_byseq_check(0, n, n, db, kv_second)); + CHK_Z(_get_bykey_check(0, n, db, kv_second)); + + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv_second); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int load_db_flush() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Get KV pairs. 
+ CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Sync and flush + s = db->sync(); + CHK_OK(s); + jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Reopen + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Get KV pairs. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Put more keys. + std::vector kv_second(n); + CHK_Z(_init_kv_pairs(n, kv_second, "key_v2", "value_v2")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv_second)); + + // Sync and flush. + s = db->sync(); + CHK_OK(s); + s = db->flushLogs(f_options); + CHK_OK(s); + + // Get KV pairs. + CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv_second)); + + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv_second); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int log_dedup() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + int n = 5; + std::vector kv(n); + // Same key, different value. + for (int i=0; isetSN(i+1, kv[i]); + } + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + + jungle::SizedBuf value_out; + db->get(kv[0].key, value_out); + CHK_EQ(kv[n-1].value, value_out); + value_out.free(); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Get KV pairs (after sync). + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + + db->get(kv[0].key, value_out); + CHK_EQ(kv[n-1].value, value_out); + value_out.free(); + + // Purge all. + jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + + // Get KV pairs (after purge). + db->get(kv[0].key, value_out); + CHK_EQ(kv[n-1].value, value_out); + value_out.free(); + + // Close DB. 
+ s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int deletion_op() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Delete some keys. + int delete_upto_exclusive = 2; + for (int i=0; idelSN(n+i+1, kv[i].key); + CHK_OK(s); + } + s = db->sync(); + CHK_OK(s); + + jungle::SizedBuf value_ret; + CHK_Z(_get_bykey_check(0, delete_upto_exclusive, db, kv, false)); + CHK_Z(_get_bykey_check(delete_upto_exclusive, n, db, kv)); + + // Should be able to get using `meta_only` flag. + for (int ii=0; iigetRecordByKey(kv[ii].key, rec_out, true); + CHK_Z(s); + CHK_OK(rec_out.isDel()); + CHK_EQ(kv[ii].key, rec_out.kv.key); + rec_out.free(); + } + + // Purge all. + jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + + CHK_Z( _get_bykey_check(0, delete_upto_exclusive, db, kv, false) ); + CHK_Z( _get_bykey_check(delete_upto_exclusive, n, db, kv) ); + + // Should be able to get using `meta_only` flag. + for (int ii=0; iigetRecordByKey(kv[ii].key, rec_out, true); + CHK_Z(s); + CHK_OK(rec_out.isDel()); + CHK_EQ(kv[ii].key, rec_out.kv.key); + rec_out.free(); + } + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int multiple_log_files() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 100; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Get KV pairs (after sync). + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Purge all. + jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + + // Get KV pairs (after purge). + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Re-open + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Get KV pairs (after purge). + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int multiple_kvs() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DBGroup* group; + jungle::DB* db; + jungle::Status s; + + // Open DB group and default DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DBGroup::open(&group, filename, config); + CHK_OK(s); + s = group->openDefaultDB(&db); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Another DB + jungle::DB* meta_store; + s = group->openDB(&meta_store, "meta"); + CHK_OK(s); + + std::vector kv_another(n); + CHK_Z(_init_kv_pairs(n, kv_another, "key", "value_another")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, meta_store, kv_another)); + s = meta_store->sync(); + CHK_OK(s); + + // Purge all. 
+ jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + s = meta_store->flushLogs(f_options); + CHK_OK(s); + + // Get KV pairs (after purge). + CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_bykey_check(0, n, meta_store, kv_another)); + + // Close DB. + s = jungle::DB::close(meta_store); + CHK_OK(s); + s = jungle::DB::close(db); + CHK_OK(s); + s = jungle::DBGroup::close(group); + CHK_OK(s); + + // reopen + s = jungle::DBGroup::open(&group, filename, config); + CHK_OK(s); + s = group->openDefaultDB(&db); + CHK_OK(s); + s = group->openDB(&meta_store, "meta"); + CHK_OK(s); + + // Get KV pairs. + CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_bykey_check(0, n, meta_store, kv_another)); + + // Close DB. + s = jungle::DB::close(meta_store); + CHK_OK(s); + s = jungle::DB::close(db); + CHK_OK(s); + s = jungle::DBGroup::close(group); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv_another); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int set_by_key() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv)); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Get KV pairs (after sync). + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Purge all. + jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + + // Get KV pairs (after purge). + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. 
+ jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int command_marker() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Set a command marker. + jungle::Record marker(jungle::Record::COMMAND); + marker.kv.alloc("marker_key", "marker_value"); + s = db->setRecord(marker); + CHK_OK(s); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Get marker, by calling getRecord(). + int marker_seqnum = n+1; + jungle::Record rec_out; + s = db->getRecord(marker_seqnum, rec_out); + CHK_OK(s); + CHK_EQ(marker.kv.key, rec_out.kv.key); + CHK_EQ(marker.kv.value, rec_out.kv.value); + CHK_EQ(marker_seqnum, (int)rec_out.seqNum); + rec_out.free(); + + // Marker is invisible by getSN(). + jungle::KV kv_out; + s = db->getSN(marker_seqnum, kv_out); + CHK_NOT(s); + + // Marker is visible by iterator; + jungle::Iterator itr; + s = itr.initSN(db); + CHK_OK(s); + + int count = 0; + do { + s = itr.get(rec_out); + if (!s) break; + + if ((int)rec_out.seqNum == marker_seqnum) { + CHK_EQ(marker.kv.key, rec_out.kv.key); + CHK_EQ(marker.kv.value, rec_out.kv.value); + } + rec_out.free(); + count++; + } while(itr.next()); + + s = itr.close(); + CHK_OK(s); + + CHK_EQ(n+1, count); + + // Purge all. + jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + + // Marker is not a normal key-value pair. + jungle::SizedBuf value_out; + s = db->get(marker.kv.key, value_out); + CHK_NOT(s); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. 
+ jungle::shutdown(); + _free_kv_pairs(n, kv); + marker.free(); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int multiple_handles() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DBGroup* group; + jungle::DB *db, *db_another; + jungle::DB *kvs, *kvs_another; + + // Open DB. + s = jungle::DBGroup::open(&group, filename, config); + CHK_OK(s); + s = group->openDefaultDB(&db); + CHK_OK(s); + + // Open the same DB using another handle. + s = group->openDefaultDB(&db_another); + CHK_OK(s); + CHK_EQ((uint64_t)db, (uint64_t)db_another); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync. + s = db->sync(); + CHK_OK(s); + + // Another handle can get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, db_another, kv)); + CHK_Z(_get_bykey_check(0, n, db_another, kv)); + + // Another KVS + s = group->openDB(&kvs, "meta"); + CHK_OK(s); + + s = group->openDB(&kvs_another, "meta"); + CHK_OK(s); + + std::vector kv_another(n); + CHK_Z(_init_kv_pairs(n, kv_another, "key", "value_another")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, kvs, kv_another)); + + // Sync. + s = kvs->sync(); + CHK_OK(s); + + // Another handle can get KV pairs. + CHK_Z(_get_byseq_check(0, n, 0, kvs_another, kv_another)); + CHK_Z(_get_bykey_check(0, n, kvs_another, kv_another)); + + // Flush all. + jungle::FlushOptions f_options; + s = db->flushLogs(f_options); + CHK_OK(s); + s = kvs->flushLogs(f_options); + CHK_OK(s); + + // Get KV pairs (after purge). + CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_bykey_check(0, n, db_another, kv)); + CHK_Z(_get_bykey_check(0, n, kvs, kv_another)); + CHK_Z(_get_bykey_check(0, n, kvs_another, kv_another)); + + // Close DB. 
+ s = jungle::DB::close(kvs_another); + CHK_OK(s); + s = jungle::DB::close(kvs); + CHK_OK(s); + s = jungle::DB::close(db_another); + CHK_OK(s); + s = jungle::DB::close(db); + CHK_OK(s); + s = jungle::DBGroup::close(group); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv_another); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int multiple_group_handles() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DBGroup *g1, *g2; + jungle::DB *db1, *db2; + jungle::DB *kvs1; + + // Open 1st group, default db, meta kvs. + s = jungle::DBGroup::open(&g1, filename, config); + CHK_OK(s); + s = g1->openDefaultDB(&db1); + CHK_OK(s); + s = g1->openDB(&kvs1, "meta"); + CHK_OK(s); + + // Open 2nd group of the same file, and open default db only. + s = jungle::DBGroup::open(&g2, filename, config); + CHK_OK(s); + CHK_EQ((uint64_t)g1, (uint64_t)g2); + s = g2->openDefaultDB(&db2); + CHK_OK(s); + + // Close 1st group. + s = jungle::DB::close(kvs1); + CHK_OK(s); + s = jungle::DB::close(db1); + CHK_OK(s); + s = jungle::DBGroup::close(g1); + CHK_OK(s); + + // Close 2nd group. + s = jungle::DB::close(db2); + CHK_OK(s); + s = jungle::DBGroup::close(g2); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int group_handle_misuse() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DBGroup* group; + jungle::DB* db; + + // Directly open DB first. + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Try to open group on the same file. + s = jungle::DBGroup::open(&group, filename, config); + // Should fail. + CHK_NOT(s); + + // Close. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. 
+ jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int purge_only_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 50; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync. + CHK_Z( db->sync() ); + + // Flush (purge-only). + jungle::FlushOptions options; + options.purgeOnly = true; + // purge upto key19. + CHK_Z(db->flushLogs(options, 20)); + + // key20 -> seq number 21. + uint64_t seq_num_out; + CHK_Z(db->getMinSeqNum(seq_num_out)); + CHK_EQ(21, seq_num_out); + + // purge all. + CHK_Z(db->flushLogs(options)); + + // All KVs are gone. + CHK_Z(_get_bykey_check(0, n, db, kv, false)); + + s = jungle::DB::close(db); + CHK_Z(s); + + // Reopen, and they should not be visiable. + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Still all KVs should not be there. + CHK_Z(_get_bykey_check(0, n, db, kv, false)); + + s = jungle::DB::close(db); + CHK_Z(s); + + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +static int _meta_test_deleted_check(jungle::DB* db, + int n, + std::vector& rec) { + for (int ii=0; iigetRecordByKey(rr.kv.key, rr_ret)); + // Get meta should succeed. + CHK_Z(db->getRecordByKey(rr.kv.key, rr_ret, true)); + std::string chk_str("meta_deleted" + TestSuite::lzStr(3, ii)); + jungle::SizedBuf exp_meta(chk_str); + CHK_EQ(exp_meta, rr_ret.meta); + } else { + // Otherwise: normal. 
+ CHK_Z(db->getRecordByKey(rr.kv.key, rr_ret)); + } + rr_ret.free(); + } + return 0; +} + +int meta_test_log() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Set KV pairs. + int n = 10; + std::vector rec(n); + for (int ii=0; iisetRecord(rr)); + } + + // Sync. + CHK_Z(db->sync()); + + // Get. + for (int ii=0; iigetRecordByKey(rr.kv.key, rr_ret)); + CHK_EQ(rr.meta, rr_ret.meta); + rr_ret.free(); + } + + // Delete even numbers. + for (int ii=0; iidelRecord(rr) ); + rr.free(); + } + CHK_Z(db->sync()); + + // Deleted record meta check. + CHK_Z(_meta_test_deleted_check(db, n, rec)); + + // Close and re-open. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Deleted record meta check: again. + CHK_Z(_meta_test_deleted_check(db, n, rec)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + for (int ii=0; ii rec(n); + for (int ii=0; iisetRecord(rr)); + } + + // Sync and flush. + CHK_Z(db->sync()); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Get. + for (int ii=0; iigetRecordByKey(rr.kv.key, rr_ret)); + CHK_EQ(rr.meta, rr_ret.meta); + rr_ret.free(); + } + + // Delete even numbers. + for (int ii=0; iidelRecord(rr) ); + rr.free(); + } + CHK_Z(db->sync()); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Deleted record meta check. + CHK_Z(_meta_test_deleted_check(db, n, rec)); + + // Close and re-open. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Deleted record meta check: again. 
+ CHK_Z(_meta_test_deleted_check(db, n, rec)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + for (int ii=0; ii(ctx); + ea->invoke(); + } + return 0; +} + +int async_flush_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.flusherSleepDuration_ms = 1000; + jungle::init(g_config); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Set KV pairs. + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync and async flush. + CHK_Z(db->sync()); + + size_t counter = 0; + size_t expected_count = 5; + EventAwaiter ea; + + for (size_t ii=0; iiflushLogsAsync + ( jungle::FlushOptions(), + std::bind( async_flush_test_cb, + &counter, + expected_count, + std::placeholders::_1, + std::placeholders::_2 ), + &ea ) ); + } + // Wait for handler. + ea.wait(); + + // All callbacks should have been invoked. + CHK_EQ( expected_count, counter ); + + // Get. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Invoke async flush and close DB without waiting. + CHK_Z(db->flushLogsAsync(jungle::FlushOptions(), nullptr, nullptr)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int async_flush_verbose_test(bool debug_level) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.flusherSleepDuration_ms = 1000; + jungle::init(g_config); + + config.logSectionOnly = true; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Set KV pairs. 
+ size_t NUM = 1000; + std::vector kv(NUM); + CHK_Z(_init_kv_pairs(NUM, kv, "key", "value")); + + CHK_EQ(4, db->getLogLevel()); + if (debug_level) { + db->setLogLevel(5); + CHK_EQ(5, db->getLogLevel()); + } + + const size_t EXP_COUNT = 11; + for (size_t ii=0; iisetSN(jj+1, kv[jj]) ); + + jungle::FlushOptions f_opt; + f_opt.syncOnly = true; + f_opt.callFsync = false; + CHK_Z( db->flushLogsAsync + ( f_opt, + std::bind( async_flush_test_cb, + &counter, + upto - ii, + std::placeholders::_1, + std::placeholders::_2 ), + &ea, + jj ) ); + } + + // Wait for handler. + ea.wait(); + // All callbacks should have been invoked. + CHK_EQ( upto - ii, counter ); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + _free_kv_pairs(NUM, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int async_flush_verbose_with_delay_test(bool debug_level) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.flusherSleepDuration_ms = 500; + jungle::init(g_config); + + config.logSectionOnly = true; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Set KV pairs. + size_t NUM = 10000; + std::vector kv(NUM); + CHK_Z(_init_kv_pairs(NUM, kv, "key", "value")); + + CHK_EQ(4, db->getLogLevel()); + if (debug_level) { + db->setLogLevel(5); + CHK_EQ(5, db->getLogLevel()); + } + + TestSuite::Timer timer(1050); + TestSuite::WorkloadGenerator wg(1000); + for (size_t ii=0; iisetSN(ii+1, kv[ii]) ); + + jungle::FlushOptions f_opt; + f_opt.syncOnly = true; + f_opt.callFsync = false; + f_opt.execDelayUs = 100*1000; + CHK_Z( db->flushLogsAsync( f_opt, nullptr, nullptr ) ); + + wg.addNumOpsDone(1); + ii += 1; + } + CHK_Z(jungle::DB::close(db)); + + // Wait one more second to see if flusher can handle stale request. 
+ TestSuite::sleep_ms(1000); + + CHK_Z(jungle::shutdown()); + _free_kv_pairs(NUM, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int flush_beyond_sync_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DB* db; + jungle::Status s; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 7; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t NUM = 100; + for (size_t ii=0; iiset( jungle::KV(key_str, val_str) ) ); + if (ii == NUM/2) { + // Sync logs in the middle. + CHK_Z( db->sync(false) ); + } + } + + for (size_t ii=0; iiget( key_str, value_out) ); + CHK_EQ( value_exp, value_out.toString() ); + } + + // Normal flush (upto last sync). + jungle::FlushOptions f_opt; + CHK_Z( db->flushLogs(f_opt) ); + + // Flush should have done upto the last sync. + uint64_t seq_num_out = 0; + CHK_Z( db->getLastFlushedSeqNum(seq_num_out) ); + CHK_EQ(NUM/2 + 1, seq_num_out); + + // Flush beyond sync (upto the latest). + f_opt.beyondLastSync = true; + CHK_Z( db->flushLogs(f_opt) ); + + // Flush should have done upto the latest. + CHK_Z( db->getLastFlushedSeqNum(seq_num_out) ); + CHK_EQ(NUM, seq_num_out); + + // Close DB without sync. + CHK_Z( jungle::DB::close(db) ); + + // Reopen. + CHK_Z( jungle::DB::open(&db, filename, config) ); + + // Put more logs. + for (size_t ii=NUM; iiset( jungle::KV(key_str, val_str) ) ); + } + + // Check, all data should exist. + for (size_t ii=0; iiget( key_str, value_out) ); + CHK_EQ( value_exp, value_out.toString() ); + } + + CHK_Z( jungle::DB::close(db) ); + + // Free all resources for jungle. 
+ jungle::shutdown(); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int get_stat_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.fdbCacheSize = 128*1024*1024; + jungle::init(g_config); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Set KV pairs. + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync and async flush. + CHK_Z(db->sync()); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + jungle::DBStats stats_out; + CHK_Z(db->getStats(stats_out)); + CHK_EQ(n, stats_out.numKvs); + CHK_GT(stats_out.workingSetSizeByte, n*10); + CHK_EQ(stats_out.cacheSizeByte, g_config.fdbCacheSize); + CHK_GT(stats_out.cacheUsedByte, 0); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int double_shutdown_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Set KV pairs. + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + CHK_Z(db->sync()); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + s = jungle::shutdown(); + CHK_EQ(jungle::Status(jungle::Status::ALREADY_SHUTDOWN), s); + + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int reopen_empty_db_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + CHK_Z(jungle::DB::open(&db, filename, config)); + // Sync & close without any insert. + s = db->sync(); // Will return error. 
+ CHK_Z(jungle::DB::close(db)); + + // Reopen & set KV pairs. + CHK_Z(jungle::DB::open(&db, filename, config)); + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + CHK_Z(db->sync()); + CHK_Z(jungle::DB::close(db)); + + // Reopen & check. + CHK_Z(jungle::DB::open(&db, filename, config)); + CHK_Z(_get_byseq_check(0, n, 0, db, kv)); + CHK_Z(jungle::DB::close(db)); + + CHK_Z(jungle::shutdown()); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int different_l0_partitions() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.numL0Partitions = 1; + CHK_Z(jungle::DB::open(&db, filename, config)); + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv)); + CHK_Z(db->sync()); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + CHK_Z(jungle::DB::close(db)); + + // Change the number of partitions, + // but it should be ignored internally. + config.numL0Partitions = 4; + + // Reopen & check. + CHK_Z(jungle::DB::open(&db, filename, config)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Insert more. + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key_new", "value_new")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv2)); + CHK_Z(db->sync()); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Check both. + CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv2)); + CHK_Z(jungle::DB::close(db)); + + // Reopen & check. 
+ CHK_Z(jungle::DB::open(&db, filename, config)); + CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv2)); + CHK_Z(jungle::DB::close(db)); + + CHK_Z(jungle::shutdown()); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int add_new_log_file_race_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + jungle::DebugParams dp; + dp.addNewLogFileCb = [&db](const jungle::DebugParams::GenericCbParams& pp) { + db->sync(false); + uint64_t seq_num_out = 0; + CHK_Z( db->getLastSyncedSeqNum(seq_num_out) ); + return 0; + }; + jungle::DB::setDebugParams(dp); + + for (size_t ii=0; ii<11; ++ii) { + std::string key_str = "k" + std::to_string(ii); + std::string val_str = "v" + std::to_string(ii); + CHK_Z( db->set( jungle::KV(key_str, val_str) ) ); + } + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("basic operation test", basic_operations_test); + ts.doTest("many logs test", many_logs_test); + ts.doTest("overwrite sequence number test", overwrite_seq); + ts.doTest("overwrite last sequence number test", overwrite_seq_last_record); + ts.doTest("load existing db test (sync)", load_db_sync); + ts.doTest("load existing db test (flush)", load_db_flush); + ts.doTest("log deduplication test", log_dedup); + ts.doTest("deletion test", deletion_op); + ts.doTest("multiple log files test", multiple_log_files); + ts.doTest("multiple KV Stores test", multiple_kvs); + ts.doTest("set by key test", set_by_key); + ts.doTest("command marker test", command_marker); + ts.doTest("multiple handles test", multiple_handles); + ts.doTest("multiple group 
handles test", multiple_group_handles); + ts.doTest("group handle misuse test", group_handle_misuse); + ts.doTest("purge only test", purge_only_test); + ts.doTest("meta test log", meta_test_log); + ts.doTest("meta test table", meta_test_table); + ts.doTest("async flush test", async_flush_test); + ts.doTest("async flush verbose test", async_flush_verbose_test, + TestRange( {false, true} ) ); + ts.doTest("async flush verbose with delay test", + async_flush_verbose_with_delay_test, + TestRange( { true } ) ); + ts.doTest("flush beyond sync test", flush_beyond_sync_test); + ts.doTest("get stat test", get_stat_test); + ts.doTest("double shutdown test", double_shutdown_test); + ts.doTest("reopen empty db test", reopen_empty_db_test); + ts.doTest("different number of L0 partitions test", different_l0_partitions); + ts.doTest("add new log file race test", add_new_log_file_race_test); + + return 0; +} diff --git a/tests/jungle/casual_test.cc b/tests/jungle/casual_test.cc new file mode 100644 index 0000000..d652c42 --- /dev/null +++ b/tests/jungle/casual_test.cc @@ -0,0 +1,729 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "jungle_test_common.h" + +#include "internal_helper.h" +#include "latency_collector.h" +#include "latency_dump.h" + +#include + +#include + +LatencyCollector global_lat; + +int bench_jungle() { + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.fdbCacheSize = (uint64_t)1024*1024*1024; // 1GB + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + //config.maxEntriesInLogFile = 1000; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + uint64_t idx = 0; + char val_char[257]; + memset(val_char, 'x', 256); + val_char[256] = 0; + std::string val(val_char); + + std::vector key_order(1000000); + for (uint64_t ii=0; ii<1000000; ++ii) key_order[ii] = ii; + for (uint64_t ii=0; ii<1000000; ++ii) { + uint64_t r1 = rand() % 1000000; + uint64_t r2 = rand() % 1000000; + uint64_t temp = key_order[r1]; + key_order[r1] = key_order[r2]; + key_order[r2] = temp; + } + + TestSuite::Progress pp(1000000, "populating"); + TestSuite::Timer tt; + for (uint64_t ii=0; ii<1000000; ++ii) { + std::string key = "k" + TestSuite::lzStr(7, key_order[ii]); + { collectBlockLatency(&global_lat, "set"); + CHK_Z(db->set(jungle::KV(key, val))); + } + idx++; + + if (idx && idx % 4000 == 0 && true) { + collectBlockLatency(&global_lat, "checkpoint"); + db->sync(false); + } + + pp.update(idx); + } + pp.done(); + TestSuite::_msg("%ld\n", tt.getTimeUs()); + TestSuite::_msg("%ld writes\n", idx); + + TestSuite::_msg("press enter..\n"); + int a = getc(stdin); (void)a; + + // Close, reopen, verify (twice). 
+ CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + pp = TestSuite::Progress(idx, "verifying"); + tt.reset(); + for (uint64_t ii=0; iiget(key_req, value_out); + } + CHK_Z(s); + value_out.free(); + pp.update(ii); + } + pp.done(); + TestSuite::_msg("%ld\n", tt.getTimeUs()); + + //TestSuite::_msg("%s\n", global_lat.dump().c_str()); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TestSuite::clearTestFile(prefix, TestSuite::END_OF_TEST); + return 0; +} + +#include + +int bench_fdb() { + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + fdb_config config = fdb_get_default_config(); + config.seqtree_opt = FDB_SEQTREE_USE; + config.buffercache_size = (uint64_t)1024*1024*1024; + + fdb_kvs_config kvs_config = fdb_get_default_kvs_config(); + fdb_file_handle *dbfile; + fdb_kvs_handle *db; + fdb_status s; + + s = fdb_open(&dbfile, filename.c_str(), &config); + s = fdb_kvs_open(dbfile, &db, NULL, &kvs_config); + + uint64_t idx = 0; + char val_char[257]; + memset(val_char, 'x', 256); + val_char[256] = 0; + std::string val(val_char); + + std::vector key_order(1000000); + for (uint64_t ii=0; ii<1000000; ++ii) key_order[ii] = ii; + for (uint64_t ii=0; ii<1000000; ++ii) { + uint64_t r1 = rand() % 1000000; + uint64_t r2 = rand() % 1000000; + uint64_t temp = key_order[r1]; + key_order[r1] = key_order[r2]; + key_order[r2] = temp; + } + + TestSuite::Progress pp(1000000, "populating"); + TestSuite::Timer tt; + for (uint64_t ii=0; ii<1000000; ++ii) { + std::string key = "k" + TestSuite::lzStr(7, key_order[ii]); + { collectBlockLatency(&global_lat, "set"); + s = fdb_set_kv(db, key.c_str(), key.size(), val.c_str(), val.size()); + } + idx++; + + if (idx && idx % 4000 == 0 && true) { + collectBlockLatency(&global_lat, "checkpoint"); + fdb_commit(dbfile, FDB_COMMIT_MANUAL_WAL_FLUSH); + } + pp.update(idx); + } + pp.done(); + 
TestSuite::_msg("%ld\n", tt.getTimeUs()); + TestSuite::_msg("%ld writes\n", idx); + fdb_commit(dbfile, FDB_COMMIT_MANUAL_WAL_FLUSH); + + pp = TestSuite::Progress(idx, "verifying"); + tt.reset(); + for (uint64_t ii=0; ii key_order(num); + for (uint64_t ii=0; iiset(jungle::KV(key, val))); + } + idx++; + + if (idx && idx % 4000 == 0 && true) { + collectBlockLatency(&global_lat, "checkpoint"); + db->sync(false); + } + + pp.update(idx); + } + pp.done(); + + uint64_t time_us = 0; + time_us = tt.getTimeUs(); + TestSuite::_msg("%s\n", TestSuite::usToString(time_us).c_str()); + TestSuite::_msg("%s ops/s\n", + TestSuite::throughputStr(num, time_us).c_str()); + + TestSuite::_msg("press enter..\n"); + int a = getc(stdin); (void)a; + + // Close, reopen, verify (twice). + CHK_Z(jungle::DB::close(db)); + + tt.reset(); + CHK_Z(jungle::DB::open(&db, filename, config)); + time_us = tt.getTimeUs(); + TestSuite::_msg("%s\n", TestSuite::usToString(time_us).c_str()); + TestSuite::_msg("%s ops/s\n", + TestSuite::throughputStr(num, time_us).c_str()); + + pp = TestSuite::Progress(idx, "verifying"); + tt.reset(); + for (uint64_t ii=0; iiget(key_req, value_out); + } + CHK_Z(s); + value_out.free(); + pp.update(ii); + } + pp.done(); + + time_us = tt.getTimeUs(); + TestSuite::_msg("%s\n", TestSuite::usToString(time_us).c_str()); + TestSuite::_msg("%s ops/s\n", TestSuite::throughputStr(num, time_us).c_str()); + + //TestSuite::_msg("%s\n", global_lat.dump().c_str()); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int see_auto_flush(uint64_t num_records) { + jungle::Status s; + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 1000; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + TestSuite::Progress pp(num_records); 
+ for (size_t ii=0; iiset(jungle::KV(key, val)) ); + pp.update(ii); + } + pp.done(); + + TestSuite::sleep_sec(5, "flushing.."); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int load_and_print() { + std::string filename = "./db_to_load"; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + jungle::Status s; + CHK_Z(jungle::DB::open(&db, filename, config)); + + jungle::Iterator itr; + itr.init(db); + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + TestSuite::_msg("seq %zu %s\n", + rec_out.seqNum, + jungle::HexDump::toString(rec_out.kv.key.toString()).c_str()); + } while (itr.next().ok()); + itr.close(); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + return 0; +} + +int flush_and_delete() { + jungle::Status s; + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num_records = 10; + + for (size_t ii=0; iiset(jungle::KV(key, val)) ); + } + + db->sync(); + db->flushLogs(jungle::FlushOptions()); + + for (size_t ii=0; iidel(jungle::SizedBuf(key)) ); + } + + // Now log section has deletion markers only. 
+ + for (size_t ii=0; iiget(jungle::SizedBuf(key), val_out) ); + val_out.free(); + } + + jungle::Iterator itr; + itr.init(db); + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + TestSuite::_msg("seq %zu %s\n", + rec_out.seqNum, + jungle::HexDump::toString(rec_out.kv.key.toString()).c_str()); + } while (itr.next().ok()); + itr.close(); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int compaction_flush_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.numCompactorThreads = 1; + g_config.flusherSleepDuration_ms = 500; + g_config.compactorSleepDuration_ms = 1000; + jungle::DB::init(g_config); + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Set delay for compaction. + jungle::DebugParams d_params; + d_params.compactionDelayUs = 100000000; + d_params.urgentCompactionFilesize = 60000; + jungle::setDebugParams(d_params); +/* + // Do compaction in background. + CompactorArgs c_args; + c_args.db = db; + TestSuite::ThreadHolder h(&c_args, compactor, nullptr); +*/ + TestSuite::sleep_sec(1, "Wait for compaction to start"); + + // Put more. + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Request async flush. + CHK_Z( db->flushLogsAsync(jungle::FlushOptions(), nullptr, nullptr) ); + + TestSuite::sleep_sec(3600, "Wait for compaction to start"); + +/* + // Wait more. + h.join(); + CHK_Z(h.getResult()); +*/ + // Close DB while copmaction is in progress, + // compaction should be cancelled immediately. 
+ CHK_Z(jungle::DB::close(db)); + + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int temp_compaction() { + std::string filename = "./bed"; + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + for (size_t ii=0; ii<4; ++ii) { + CHK_Z( db->compactL0(jungle::CompactOptions(), ii) ); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + return 0; +} + +int multi_db_compact_order_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.numCompactorThreads = 1; + g_config.flusherSleepDuration_ms = 1000; + g_config.compactorSleepDuration_ms = 3000; + jungle::DB::init(g_config); + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + std::vector dbs(4, nullptr); + size_t cnt = 0; + for (auto& entry: dbs) { + jungle::DB* db = entry; + std::string f_local = filename + "_" + std::to_string(cnt++); + CHK_Z( jungle::DB::open(&db, f_local, config) ); + db->setLogLevel(5); + } + + TestSuite::sleep_sec(3600, "Wait for compaction to start"); + + for (jungle::DB* db: dbs) { + CHK_Z(jungle::DB::close(db)); + } + + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int split_testbed() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxL0TableSize = 64*1024*1024; + config.maxL1TableSize = 64*1024*1024; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + const size_t NUM = 2000000; + std::string key_payload(48, 'x'); + std::string val_payload(512, 'x'); + TestSuite::Progress pp(NUM); + for (size_t ii=0; iiset( jungle::KV(key_str, val_payload) ) ); + pp.update(ii); + } + pp.done(); + db->sync(false); + db->flushLogs(jungle::FlushOptions()); + + pp = TestSuite::Progress(4); + for (size_t ii=0; ii<4; ++ii) { + CHK_Z( db->compactL0(jungle::CompactOptions(), ii) ); + pp.update(ii); + } + pp.done(); + + CHK_Z(db->splitLevel(jungle::CompactOptions(), 1)); + + TestSuite::sleep_sec(60); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int delete_compaction_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.numCompactorThreads = 2; + g_config.compactorSleepDuration_ms = 1000; + g_config.fdbCacheSize = (uint64_t)2*1024*1024*1024; + jungle::DB::init(g_config); + + jungle::DBConfig d_config; + d_config.maxL0TableSize = 128*1024*1024; + d_config.maxL1TableSize = 128*1024*1024; + jungle::DB* db = nullptr; + CHK_Z( jungle::DB::open(&db, filename, d_config) ); + + char val_str_raw[1024]; + memset(val_str_raw, 'x', 1024); + + const size_t NUM = 1000000; + TestSuite::Progress pp(NUM, "insert"); + for (size_t ii=0; iiset( jungle::KV( jungle::SizedBuf(key_str), + jungle::SizedBuf(1024, val_str_raw) ) ) ); + pp.update(ii+1); + } + pp.done(); + + // Flush all logs. + db->sync(false); + db->flushLogs(jungle::FlushOptions()); + // Wait. + TestSuite::sleep_sec(5, "wait for flushing"); + + // Compact L0. 
+ for (size_t ii=0; ii<4; ++ii) { + db->compactL0(jungle::CompactOptions(), ii); + } + + pp = TestSuite::Progress(NUM / 2, "delete"); + for (size_t ii=0; iidel( jungle::SizedBuf(key_str) ) ); + } + pp.update(ii+1); + } + pp.done(); + + // Flush all logs. + db->sync(false); + db->flushLogs(jungle::FlushOptions()); + // Wait. + TestSuite::sleep_sec(5, "wait for flushing"); + + // Compact L0. + for (size_t ii=0; ii<4; ++ii) { + db->compactL0(jungle::CompactOptions(), ii); + } + + // Set urgent compaction. + jungle::DebugParams d_params; + d_params.urgentCompactionRatio = 120; + jungle::setDebugParams(d_params); + + // Wait. + TestSuite::sleep_sec(10, "wait for compaction"); + + // Flush all logs. + db->sync(false); + db->flushLogs(jungle::FlushOptions()); + // Compact L0. + for (size_t ii=0; ii<4; ++ii) { + db->compactL0(jungle::CompactOptions(), ii); + } + + // Check. + pp = TestSuite::Progress(NUM, "verify"); + for (size_t ii=0; iiget(jungle::SizedBuf(key_str), value_out); + if (ii < NUM/2 && ii % 100 != 0) { + CHK_FALSE(s); + } else { + CHK_OK(s); + } + pp.update(ii+1); + } + pp.done(); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +/* +int basic_test_template() { + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + jungle::shutdown(); + TestSuite::clearTestFile(prefix); + return 0; +} +*/ + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.options.printTestMessage = true; + ts.options.preserveTestFiles = true; + + ts.doTest("test", delete_compaction_test); + return 0; + + ts.doTest("temp split", split_testbed); + ts.doTest("multi db compact order test", multi_db_compact_order_test); + + ts.doTest( "bench jungle", bench_jungle ); + ts.doTest( "bench fdb", bench_fdb ); + ts.doTest( 
"long log read speed", + long_log_read_speed, + TestRange({1000000}) ); + ts.doTest( "see auto flushing", + see_auto_flush, + TestRange({100000}) ); + ts.doTest("load and print", load_and_print); + ts.doTest("flush and delete", flush_and_delete); + ts.doTest("multi db compact order test", multi_db_compact_order_test); + + return 0; +} diff --git a/tests/jungle/compaction_test.cc b/tests/jungle/compaction_test.cc new file mode 100644 index 0000000..e30249b --- /dev/null +++ b/tests/jungle/compaction_test.cc @@ -0,0 +1,964 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "jungle_test_common.h" + +#include "internal_helper.h" + +#include + +#include + +namespace compaction_test { + +int single_thread_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num = 100; + + // Write even numbers. + CHK_Z(_set_keys(db, 0, num, 2, "k%06zu", "v%06zu")); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. 
+ jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Write odd numbers. + CHK_Z(_set_keys(db, 1, num, 2, "k%06zu", "v%06zu")); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Point query check. + CHK_Z(_get_keys(db, 0, num, 1, "k%06zu", "v%06zu")); + + // Range query check. + CHK_Z(_iterate_keys(db, 0, num-1, 1, "k%06zu", "v%06zu")); + + // Compact more. + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Check again. + CHK_Z(_get_keys(db, 0, num, 1, "k%06zu", "v%06zu")); + CHK_Z(_iterate_keys(db, 0, num-1, 1, "k%06zu", "v%06zu")); + + // Close and re-open. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check again. + CHK_Z(_get_keys(db, 0, num, 1, "k%06zu", "v%06zu")); + CHK_Z(_iterate_keys(db, 0, num-1, 1, "k%06zu", "v%06zu")); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +struct WriterArgs : TestSuite::ThreadArgs { + WriterArgs() + : durationSec(1) + , num(1000) + , db(nullptr) + , stopSignal(false) + , rptResult(0) + {} + size_t durationSec; + size_t num; + jungle::DB* db; + std::atomic stopSignal; + size_t rptResult; +}; + +int writer(TestSuite::ThreadArgs* t_args) { + WriterArgs* args = static_cast(t_args); + TestSuite::Timer tt(args->durationSec * 1000); + + size_t rpt = 0; + while (!tt.timeover() && !args->stopSignal) { + rpt++; + CHK_Z( _set_keys( args->db, 0, args->num, 1, + "k%06zu", "v%06zu_" + std::to_string(rpt) ) ); + // Sync & flush. + CHK_Z(args->db->sync(false)); + CHK_Z(args->db->flushLogs(jungle::FlushOptions())); + } + args->rptResult = rpt; + return 0; +} + +int concurrent_writer_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.nextLevelExtension = false; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t duration_sec = 1; + size_t num = 1000; + + WriterArgs w_args; + w_args.db = db; + w_args.num = num; + w_args.durationSec = duration_sec; + TestSuite::ThreadHolder h(&w_args, writer, nullptr); + + TestSuite::Timer tt(duration_sec * 1000); + while (!tt.timeover()) { + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + TestSuite::sleep_ms(100); + } + + h.join(); + CHK_Z(h.getResult()); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Check. + size_t rpt = w_args.rptResult; + CHK_Z(_get_keys(db, 0, num, 1, "k%06zu", "v%06zu_" + std::to_string(rpt))); + CHK_Z(_iterate_keys(db, 0, num-1, 1, "k%06zu", "v%06zu_" + std::to_string(rpt))); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int irrelevant_table_file_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num = 100; + + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Create dummy table files: 4 and 5. + for ( std::string suffix: + {"/table0000_00000004", "/table0000_00000005"} ) { + std::string table_file = filename + suffix; + std::ofstream fs_out; + fs_out.open(table_file); + CHK_OK(fs_out.good()); + fs_out.write("garbage", 7); + fs_out.close(); + } + + // Do compaction. + for (size_t ii=0; iicompactL0(jungle::CompactOptions(), ii)); + } + + // Close. 
+ CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int auto_compactor_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.numCompactorThreads = 1; + g_config.compactorSleepDuration_ms = 5000; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num = 400000; + + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Sleep 10 secs. + TestSuite::sleep_sec(20, "compactor"); + + // Close. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +jungle::CompactionCbDecision callback(const jungle::CompactionCbParams& params) { + // Drops all odd number KVs. + std::string num_str = std::string((char*)params.rec.kv.key.data + 1, + params.rec.kv.key.size - 1); + size_t num = atoi(num_str.c_str()); + if (num % 2 == 1) { + return jungle::CompactionCbDecision::DROP; + } + return jungle::CompactionCbDecision::KEEP; +} + +int callback_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + // This callback function will drop all odd number KVs. + config.compactionCbFunc = callback; + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num = 100; + + CHK_Z(_set_keys(db, 0, num, 1, "k%06zu", "v%06zu")); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. 
+ jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + for (size_t ii=0; iiget(key, val) ); + } else { + CHK_NOT( db->get(key, val) ); + } + } + + jungle::Iterator itr; + CHK_Z( itr.init(db) ); + size_t idx = 0; + do { + jungle::Record rec; + jungle::Record::Holder h_rec(rec); + s = itr.get(rec); + if (!s) break; + + char key_str[256]; + sprintf(key_str, "k%06zu", idx); + jungle::SizedBuf key(key_str); + + CHK_EQ(key, rec.kv.key); + idx += 2; + + } while (itr.next().ok()); + CHK_Z( itr.close() ); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int tombstone_compaction_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Delete key 50 + jungle::SizedBuf key_to_del("k000050"); + CHK_Z( db->del(key_to_del) ); + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Check before compaction. + for (size_t ii=0; iiget(key, val) ); + } else { + CHK_NOT( db->get(key, val) ); + } + } + + // Compaction again. + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Check after copmaction. 
+ for (size_t ii=0; iiget(key, val) ); + } else { + CHK_NOT( db->get(key, val) ); + } + } + + jungle::Iterator itr; + CHK_Z( itr.init(db) ); + size_t idx = 0; + do { + jungle::Record rec; + jungle::Record::Holder h_rec(rec); + s = itr.get(rec); + if (!s) break; + + char key_str[256]; + sprintf(key_str, "k%06zu", idx); + jungle::SizedBuf key(key_str); + + CHK_EQ(key, rec.kv.key); + if (idx == 49) idx += 2; + else idx++; + + } while (itr.next().ok()); + CHK_Z( itr.close() ); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; + + return 0; +} + +struct CompactorArgs : TestSuite::ThreadArgs { + CompactorArgs() + : db(nullptr) + , hashNum(0) + , numPartitions(4) + , expResult(jungle::Status::COMPACTION_CANCELLED) {} + jungle::DB* db; + size_t hashNum; + size_t numPartitions; + jungle::Status expResult; +}; + +int compactor(TestSuite::ThreadArgs* t_args) { + CompactorArgs* args = static_cast(t_args); + jungle::CompactOptions c_opt; + + if (args->hashNum != _SCU32(-1)) { + // Hash number is given. + jungle::Status s = args->db->compactL0(c_opt, args->hashNum); + CHK_EQ(args->expResult, s); + + } else { + // Not given, compact all. + for (size_t ii=0; iinumPartitions; ++ii) { + jungle::Status s = args->db->compactL0(c_opt, ii); + CHK_EQ(args->expResult, s); + } + } + return 0; +} + +int compaction_cancel_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Set delay for compaction. 
+ jungle::DebugParams d_params; + d_params.compactionDelayUs = 500000; // 0.5 sec per record. + jungle::setDebugParams(d_params); + + // Do compaction in background. + CompactorArgs c_args; + c_args.db = db; + TestSuite::ThreadHolder h(&c_args, compactor, nullptr); + + TestSuite::sleep_sec(1, "Wait for compaction to start"); + + // Close DB while copmaction is in progress, + // compaction should be cancelled immediately. + CHK_Z(jungle::DB::close(db)); + + h.join(); + CHK_Z(h.getResult()); + + // Re-open. + CHK_Z( jungle::DB::open(&db, filename, config) ); + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Remove compaction delay. + d_params.compactionDelayUs = 0; + jungle::setDebugParams(d_params); + + // Compaction should work this time. + c_args.db = db; + c_args.expResult = jungle::Status::OK; + CHK_Z( compactor(&c_args) ); + + // All keys should be there. + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + CHK_Z( jungle::DB::close(db) ); + + // Re-open and check again. + CHK_Z( jungle::DB::open(&db, filename, config) ); + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + CHK_Z( jungle::DB::close(db) ); + + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int merge_compaction_cancel_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction once. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Set delay for compaction. 
+ jungle::DebugParams d_params; + d_params.compactionDelayUs = 500000; // 0.5 sec per record. + jungle::setDebugParams(d_params); + + // Do compaction in background. + CompactorArgs c_args; + c_args.db = db; + TestSuite::ThreadHolder h(&c_args, compactor, nullptr); + + TestSuite::sleep_sec(1, "Wait for compaction to start"); + + // Close DB while copmaction is in progress, + // compaction should be cancelled immediately. + CHK_Z(jungle::DB::close(db)); + + h.join(); + CHK_Z(h.getResult()); + + // Re-open. + CHK_Z( jungle::DB::open(&db, filename, config) ); + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Remove compaction delay. + d_params.compactionDelayUs = 0; + jungle::setDebugParams(d_params); + + // Compaction should work this time. + c_args.db = db; + c_args.expResult = jungle::Status::OK; + CHK_Z( compactor(&c_args) ); + + // All keys should be there. + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + CHK_Z( jungle::DB::close(db) ); + + // Re-open and check again. + CHK_Z( jungle::DB::open(&db, filename, config) ); + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + CHK_Z( jungle::DB::close(db) ); + + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +jungle::CompactionCbDecision + callback2(const jungle::CompactionCbParams& params, bool* flag) +{ + // To check whether compaction happened or not. + if (flag) *flag = true; + return jungle::CompactionCbDecision::KEEP; +} + +int compaction_cancel_recompact_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Sync & flush. 
+ CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Set delay for compaction. + jungle::DebugParams d_params; + d_params.compactionDelayUs = 500000; // 0.5 sec per record. + jungle::setDebugParams(d_params); + + // Do compaction in background. + CompactorArgs c_args; + c_args.db = db; + TestSuite::ThreadHolder h(&c_args, compactor, nullptr); + + TestSuite::sleep_sec(1, "Wait for compaction to start"); + + // Close DB while copmaction is in progress, + // compaction should be cancelled immediately. + CHK_Z(jungle::DB::close(db)); + + h.join(); + CHK_Z(h.getResult()); + + // Completely shutdown and re-open with background compactor. + CHK_Z(jungle::shutdown()); + jungle::GlobalConfig g_conf; + g_conf.compactorSleepDuration_ms = 500; + g_conf.numCompactorThreads = 1; + CHK_Z( jungle::init(g_conf) ); + + bool compaction_happened = false; + config.compactionCbFunc = std::bind( callback2, + std::placeholders::_1, + &compaction_happened ); + + // Re-open. + CHK_Z( jungle::DB::open(&db, filename, config) ); + + // Remove compaction delay. + d_params.compactionDelayUs = 0; + jungle::setDebugParams(d_params); + + // Compaction should work this time. + TestSuite::sleep_sec(1, "Wait for bg compaction"); + + CHK_TRUE( compaction_happened ); + + // All keys should be there. + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + CHK_Z( jungle::DB::close(db) ); + + // Re-open and check again. + CHK_Z( jungle::DB::open(&db, filename, config) ); + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + CHK_Z( jungle::DB::close(db) ); + + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int merge_compaction_cancel_recompact_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction once. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Set delay for compaction. + jungle::DebugParams d_params; + d_params.compactionDelayUs = 500000; // 0.5 sec per record. + jungle::setDebugParams(d_params); + + // Do compaction in background. + CompactorArgs c_args; + c_args.db = db; + TestSuite::ThreadHolder h(&c_args, compactor, nullptr); + + TestSuite::sleep_sec(1, "Wait for compaction to start"); + + // Close DB while copmaction is in progress, + // compaction should be cancelled immediately. + CHK_Z(jungle::DB::close(db)); + + h.join(); + CHK_Z(h.getResult()); + + // Completely shutdown and re-open with background compactor. + CHK_Z(jungle::shutdown()); + jungle::GlobalConfig g_conf; + g_conf.compactorSleepDuration_ms = 500; + g_conf.numCompactorThreads = 1; + CHK_Z( jungle::init(g_conf) ); + + bool compaction_happened = false; + config.compactionCbFunc = std::bind( callback2, + std::placeholders::_1, + &compaction_happened ); + + // Re-open. + CHK_Z( jungle::DB::open(&db, filename, config) ); + + // Remove compaction delay. + d_params.compactionDelayUs = 0; + jungle::setDebugParams(d_params); + + // Compaction should work this time. + TestSuite::sleep_sec(1, "Wait for bg compaction"); + + CHK_TRUE( compaction_happened ); + + // All keys should be there. + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + CHK_Z( jungle::DB::close(db) ); + + // Re-open and check again. 
+ CHK_Z( jungle::DB::open(&db, filename, config) ); + CHK_Z( _get_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + CHK_Z( jungle::DB::close(db) ); + + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int newest_value_after_compaction_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction once. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Update even numbers. + CHK_Z( _set_keys(db, 0, num, 2, "k%06zu", "even_%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + CHK_Z( _get_keys(db, 1, num, 2, "k%06zu", "v%06zu") ); + CHK_Z( _get_keys(db, 0, num, 2, "k%06zu", "even_%06zu") ); + + // Do L0 compaction again. + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Update odd numbers. + CHK_Z( _set_keys(db, 1, num, 2, "k%06zu", "odd_%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + CHK_Z( _get_keys(db, 1, num, 2, "k%06zu", "odd_%06zu") ); + CHK_Z( _get_keys(db, 0, num, 2, "k%06zu", "even_%06zu") ); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int newest_value_during_compaction_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB with compaction callback function. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + CHK_Z( _set_keys(db, 0, num, 1, "k%06zu", "v%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Set delay for compaction. + jungle::DebugParams d_params; + d_params.compactionDelayUs = 5000; // 5 ms per record. + jungle::setDebugParams(d_params); + + // Do L0 compaction in background. + CompactorArgs c_args; + c_args.db = db; + c_args.hashNum = _SCU32(-1); + c_args.expResult = jungle::Status(); + + // It will take 500 ms. + TestSuite::ThreadHolder h(&c_args, compactor, nullptr); + TestSuite::sleep_ms(100); + + // Update even numbers. + CHK_Z( _set_keys(db, 0, num, 2, "k%06zu", "even_%06zu") ); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + CHK_Z( _get_keys(db, 1, num, 2, "k%06zu", "v%06zu") ); + CHK_Z( _get_keys(db, 0, num, 2, "k%06zu", "even_%06zu") ); + + h.join(); + CHK_Z( h.getResult() ); + + // Do L0 compaction again, it will take 500 ms. + TestSuite::ThreadHolder h2(&c_args, compactor, nullptr); + TestSuite::sleep_ms(100); + + // Update odd numbers. + CHK_Z( _set_keys(db, 1, num, 2, "k%06zu", "odd_%06zu") ); + + // Sync & flush. 
+ CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + CHK_Z( _get_keys(db, 1, num, 2, "k%06zu", "odd_%06zu") ); + CHK_Z( _get_keys(db, 0, num, 2, "k%06zu", "even_%06zu") ); + + h2.join(); + CHK_Z( h2.getResult() ); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +} using namespace compaction_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("compaction single thread test", + single_thread_test); + + ts.doTest("compaction with writer thread test", + concurrent_writer_test); + + ts.doTest("irrelevant table file test", + irrelevant_table_file_test); + + ts.doTest("callback test", + callback_test); + + ts.doTest("tombstone compaction test", + tombstone_compaction_test); + + ts.doTest("compaction cancel test", + compaction_cancel_test); + + ts.doTest("merge-compaction cancel test", + merge_compaction_cancel_test); + + ts.doTest("compaction cancel recompact test", + compaction_cancel_recompact_test); + + ts.doTest("merge-compaction cancel recompact test", + merge_compaction_cancel_recompact_test); + + ts.doTest("newest value after compaction test", + newest_value_after_compaction_test); + + ts.doTest("newest value during compaction test", + newest_value_during_compaction_test); + +#if 0 + ts.doTest("auto compactor test", + auto_compactor_test); +#endif + + return 0; +} + diff --git a/tests/jungle/corruption_test.cc b/tests/jungle/corruption_test.cc new file mode 100644 index 0000000..af4bb01 --- /dev/null +++ b/tests/jungle/corruption_test.cc @@ -0,0 +1,924 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "jungle_test_common.h" + +#include "internal_helper.h" + +#include + +#include + +namespace corruption_test { + +static int truncate_file(const std::string& filename, + size_t amount) { + std::ifstream fs_in; + fs_in.open(filename); + CHK_OK(fs_in.good()); + + fs_in.seekg(0, fs_in.end); + size_t fs_size = fs_in.tellg(); + fs_in.seekg(0, fs_in.beg); + char buffer[fs_size]; + fs_in.read(buffer, fs_size); + fs_in.close(); + + std::ofstream fs_out; + fs_out.open(filename); + CHK_OK(fs_out.good()); + fs_out.write(buffer, fs_size - amount); + fs_out.close(); + return 0; +} + +static int inject_crc_error(const std::string& filename, + size_t offset = 16) { + std::ifstream fs_in; + fs_in.open(filename); + CHK_OK(fs_in.good()); + + fs_in.seekg(0, fs_in.end); + size_t fs_size = fs_in.tellg(); + fs_in.seekg(0, fs_in.beg); + char buffer[fs_size]; + fs_in.read(buffer, fs_size); + fs_in.close(); + + // Flip. + buffer[offset] = ~buffer[offset]; + + std::ofstream fs_out; + fs_out.open(filename); + CHK_OK(fs_out.good()); + fs_out.write(buffer, fs_size); + fs_out.close(); + return 0; +} + +int log_file_truncation_test(size_t amount) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Write something. 
+ size_t num = 101; + std::vector kv(num); + CHK_Z(_init_kv_pairs(num, kv, "key", "value")); + + for (size_t ii=0; iisetSN(ii+1, kv[ii])); + } + CHK_Z(db->sync(false)); + CHK_Z(jungle::DB::close(db)); + + // Truncate file. + CHK_Z(truncate_file(filename + "/log0000_00000000", amount)); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t corrupted_idx = 0; + for (size_t ii=0; iigetSN(ii+1, kv_out); + if (!s) { + corrupted_idx = ii; + break; + } + kv_out.free(); + } + // Corruption should happened. + CHK_GT(corrupted_idx, 0); + CHK_SM(corrupted_idx, num); + + // Insert & recover. + for (size_t ii=corrupted_idx; iisetSN(ii+1, kv[ii])); + } + + // Get check. + for (size_t ii=0; iigetSN(ii+1, kv_out)); + CHK_EQ(kv[ii].key, kv_out.key); + CHK_EQ(kv[ii].value, kv_out.value); + kv_out.free(); + TestSuite::clearInfo(); + } + + // Close and reopen. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get check. + for (size_t ii=0; iigetSN(ii+1, kv_out)); + CHK_EQ(kv[ii].key, kv_out.key); + CHK_EQ(kv[ii].value, kv_out.value); + kv_out.free(); + TestSuite::clearInfo(); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + CHK_Z(_free_kv_pairs(num, kv)); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int log_manifest_corruption_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Write something. 
+ size_t num = 6; + std::vector kv(num); + CHK_Z(_init_kv_pairs(num, kv, "key", "value")); + + for (size_t ii=0; iisetSN(ii+1, kv[ii])); + } + CHK_Z(db->sync(false)); + + for (size_t ii=num/2; iisetSN(ii+1, kv[ii])); + } + CHK_Z(db->sync(false)); + jungle::FlushOptions f_opt; + f_opt.purgeOnly = true; + CHK_Z(db->flushLogs(f_opt, num/2 - 1)); + CHK_Z(jungle::DB::close(db)); + + // Corrupt manifest file. + CHK_Z(inject_crc_error(filename + "/log0000_manifest")); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get check. + for (size_t ii=num/2; iigetSN(ii+1, kv_out)); + CHK_EQ(kv[ii].key, kv_out.key); + CHK_EQ(kv[ii].value, kv_out.value); + kv_out.free(); + TestSuite::clearInfo(); + } + + // Close. + CHK_Z(jungle::DB::close(db)); + + // Corrupt both manifest file & backup file. + CHK_Z(inject_crc_error(filename + "/log0000_manifest")); + CHK_Z(inject_crc_error(filename + "/log0000_manifest.bak")); + + // Should fail. + s = jungle::DB::open(&db, filename, config); + CHK_NOT(s); + + CHK_Z(jungle::shutdown()); + CHK_Z(_free_kv_pairs(num, kv)); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int log_manifest_corruption_across_file_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Write something. + size_t num = 12; + size_t kv_num = 100; + std::vector kv(kv_num); + CHK_Z(_init_kv_pairs(kv_num, kv, "key", "value")); + + for (size_t ii=0; iisetSN(ii+1, kv[ii])); + } + CHK_Z(db->sync(false)); + + // Copy mani file somewhere + TestSuite::copyfile(filename + "/log0000_manifest", + filename + "/log0000_manifest.keep"); + + for (size_t ii=num/2; iisetSN(ii+1, kv[ii])); + } + + // NOTE: `close` will internally call `sync`. + CHK_Z(jungle::DB::close(db)); + + // Restore as a backup file. 
+ TestSuite::copyfile(filename + "/log0000_manifest.keep", + filename + "/log0000_manifest.bak"); + + // Corrupt manifest file. + CHK_Z(inject_crc_error(filename + "/log0000_manifest")); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get last seq num. + uint64_t last_seqnum; + CHK_Z(db->getMaxSeqNum(last_seqnum)); + + // Get check. + for (size_t ii=1; ii<=last_seqnum; ++ii) { + TestSuite::setInfo("ii=%zu", ii); + jungle::KV kv_out; + CHK_Z(db->getSN(ii, kv_out)); + kv_out.free(); + TestSuite::clearInfo(); + } + + // Set more, it will overwrite previous log files. + std::vector kv_after(kv_num); + CHK_Z(_init_kv_pairs(kv_num, kv_after, "key", "value_after_crash")); + for (size_t ii=last_seqnum+1; ii<=last_seqnum+5; ++ii) { + CHK_Z(db->setSN(ii, kv_after[ii-1])); + } + + // Close & reopen. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get check. + for (size_t ii=1; ii<=last_seqnum+5; ++ii) { + TestSuite::setInfo("ii=%zu", ii); + jungle::KV kv_out; + CHK_Z(db->getSN(ii, kv_out)); + if (ii <= last_seqnum) { + CHK_EQ(kv[ii-1].key, kv_out.key); + CHK_EQ(kv[ii-1].value, kv_out.value); + } else { + CHK_EQ(kv_after[ii-1].key, kv_out.key); + CHK_EQ(kv_after[ii-1].value, kv_out.value); + } + kv_out.free(); + TestSuite::clearInfo(); + } + CHK_Z(jungle::DB::close(db)); + + CHK_Z(jungle::shutdown()); + CHK_Z(_free_kv_pairs(kv_num, kv)); + CHK_Z(_free_kv_pairs(kv_num, kv_after)); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int incomplete_log_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t sync_point = 25; + size_t more_insert = 100; + + // Write something. + for (size_t ii=0; iiset(jungle::KV(key_str, val_str)) ); + } + + // Sync in the middle. 
+ CHK_Z( db->sync(false) ); + + // Write more. + for (size_t ii=sync_point; iiset(jungle::KV(key_str, val_str)) ); + } + + // Copy whole database to other place. + std::string cmd; + std::string filename_copy = filename + "_copy"; + cmd = "cp -R " + filename + " " + filename + "_copy"; + int r = ::system(cmd.c_str()); + (void)r; + + // Close original. + CHK_Z(jungle::DB::close(db)); + + // Open copy, should work. + CHK_Z(jungle::DB::open(&db, filename_copy, config)); + + // All synced KV should exist. + for (size_t ii=0; iiget(jungle::SizedBuf(key_str), value_out) ); + CHK_EQ(jungle::SizedBuf(val_str), value_out); + value_out.free(); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int irrelevant_log_file_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Make an irrelevant log file. + std::string log_file = filename + "/log0000_00000003"; + std::ofstream fs_out; + fs_out.open(log_file); + CHK_OK(fs_out.good()); + fs_out.write("garbage", 7); + fs_out.close(); + + size_t num = 100; + + // Write something beyond that irrelevant file. + for (size_t ii=0; iiset(jungle::KV(key_str, val_str)) ); + } + + // Sync. + CHK_Z( db->sync(false) ); + + // Close and re-open. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // All synced KV should exist. 
+ for (size_t ii=0; iiget(jungle::SizedBuf(key_str), value_out) ); + CHK_EQ(jungle::SizedBuf(val_str), value_out); + value_out.free(); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int not_existing_log_file_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num = 15; + + // Write something beyond that irrelevant file. + for (size_t ii=0; iiset(jungle::KV(key_str, val_str)) ); + } + + // Sync. + CHK_Z( db->sync(false) ); + + // Close. + CHK_Z(jungle::DB::close(db)); + + // Remove the 2nd log file. + std::string log_file = filename + "/log0000_00000001"; + TestSuite::remove(log_file); + + // Re-open. + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Try to retrieve KV. + size_t succ_count = 0; + for (size_t ii=0; iiget(jungle::SizedBuf(key_str), value_out); + if (s) { + CHK_EQ(jungle::SizedBuf(val_str), value_out); + value_out.free(); + succ_count++; + } + } + CHK_EQ(config.maxEntriesInLogFile, succ_count); + + // Write lost logs again. + for (size_t ii=succ_count; iiset(jungle::KV(key_str, val_str)) ); + } + // Sync. + CHK_Z( db->sync(false) ); + + // Close and re-open. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Try to retrieve KV. + for (size_t ii=0; iiget(jungle::SizedBuf(key_str), value_out) ); + CHK_EQ(jungle::SizedBuf(val_str), value_out); + value_out.free(); + } + CHK_Z(jungle::DB::close(db)); + + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int wrong_manifest_test(bool log_section) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = log_section; + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM1 = 15; + size_t NUM2 = 25; + size_t NUM3 = 95; + size_t NUM4 = 105; + size_t NUM5 = 135; + + // Write something beyond that irrelevant file. + for (size_t ii=0; iiset(jungle::KV(key_str, val_str)) ); + } + + // Sync. + CHK_Z( db->sync(false) ); + // Flush. + CHK_Z( db->flushLogs(jungle::FlushOptions(), NUM1) ); + + // Keep manifest file. + jungle::FileMgr::copy(filename + "/log0000_manifest", + filename + "/log0000_manifest.copy"); + + for (size_t ii=NUM2; iiset(jungle::KV(key_str, val_str)) ); + } + // Sync. + CHK_Z( db->sync(false) ); + // Flush. + CHK_Z( db->flushLogs(jungle::FlushOptions(), NUM3) ); + + // Wait until pending files are removed. + TestSuite::sleep_sec(1, "waiting for file purge"); + + // Close. + CHK_Z(jungle::DB::close(db)); + + // Restore old manifest file, to mimic crash without sync. + jungle::FileMgr::copy(filename + "/log0000_manifest.copy", + filename + "/log0000_manifest"); + + // Re-open and set more. + CHK_Z(jungle::DB::open(&db, filename, config)); + for (size_t ii=NUM4; iiset(jungle::KV(key_str, val_str)) ); + } + // Sync. + CHK_Z( db->sync(false) ); + + if (log_section) { + for (size_t ii=NUM4; iiget( jungle::SizedBuf(key_str), value_out ) ); + CHK_EQ(val_str, value_out.toString()); + } + + } else { + for (size_t ii=0; iiget( jungle::SizedBuf(key_str), value_out ) ); + CHK_EQ(val_str, value_out.toString()); + } + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int removed_log_files_at_the_beginning_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + config.logFileTtl_sec = 60; + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Write something. + size_t KV_NUM = 100; + size_t FLUSH_NUM = 25; + size_t EXPECTED_MIN_NUM = 41; + + std::vector kv(KV_NUM); + CHK_Z(_init_kv_pairs(KV_NUM, kv, "key", "value")); + + for (size_t ii=0; iisetSN(ii+1, kv[ii])); + } + CHK_Z(db->sync(false)); + + jungle::FlushOptions f_opt; + f_opt.purgeOnly = true; + CHK_Z(db->flushLogs(f_opt, FLUSH_NUM)); + + CHK_Z(jungle::DB::close(db)); + + // Remove the first two log files. + TestSuite::remove(filename + "/log0000_00000002"); + TestSuite::remove(filename + "/log0000_00000003"); + + // Open. + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get first seq num, should not hang. + uint64_t min_seqnum = 0; + CHK_Z(db->getMinSeqNum(min_seqnum)); + CHK_EQ(EXPECTED_MIN_NUM, min_seqnum); + + // Get last seq num, should not hang. + uint64_t last_seqnum = 0; + CHK_Z(db->getMaxSeqNum(last_seqnum)); + CHK_EQ(KV_NUM, last_seqnum); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + CHK_Z(_free_kv_pairs(KV_NUM, kv)); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int removed_log_files_in_the_middle_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + config.logFileTtl_sec = 60; + config.maxEntriesInLogFile = 10; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Write something. 
+ size_t KV_NUM = 100; + size_t FLUSH_NUM = 25; + size_t EXPECTED_LAST_NUM = 70; + + std::vector kv(KV_NUM); + CHK_Z(_init_kv_pairs(KV_NUM, kv, "key", "value")); + + for (size_t ii=0; iisetSN(ii+1, kv[ii])); + } + CHK_Z(db->sync(false)); + + jungle::FlushOptions f_opt; + f_opt.purgeOnly = true; + CHK_Z(db->flushLogs(f_opt, FLUSH_NUM)); + + CHK_Z(jungle::DB::close(db)); + + // Remove two log files in the middle. + TestSuite::remove(filename + "/log0000_00000007"); + TestSuite::remove(filename + "/log0000_00000008"); + + // Open. + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get first seq num, should not hang. + uint64_t min_seqnum = 0; + CHK_Z(db->getMinSeqNum(min_seqnum)); + CHK_EQ(FLUSH_NUM + 1, min_seqnum); + + // Get last seq num, should not hang. + uint64_t last_seqnum = 0; + CHK_Z(db->getMaxSeqNum(last_seqnum)); + CHK_EQ(EXPECTED_LAST_NUM, last_seqnum); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + CHK_Z(_free_kv_pairs(KV_NUM, kv)); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int incomplete_table_set_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + std::string filename_copy = filename + "_copy"; + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.numTableWriters = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + jungle::DebugParams d_params; + d_params.tableSetBatchCb = + [filename, filename_copy] + (const jungle::DebugParams::GenericCbParams& p) { + static size_t count = 0; + if (count++ == 0) { + // Sleep a second to flush log. + TestSuite::sleep_sec(1); + // Copy whole database to other place. + std::string cmd; + cmd = "cp -R " + filename + " " + filename_copy; + int r = ::system(cmd.c_str()); + (void)r; + } + TestSuite::_msg("flush\n"); + }; + db->setDebugParams(d_params); + + const size_t NUM = 1000; + + // Write something. 
+ for (size_t ii=0; iiset(jungle::KV(key_str, val_str)) ); + } + + // Sync and flush. + CHK_Z( db->sync(false) ); + CHK_Z( db->flushLogs(jungle::FlushOptions()) ); + + // Close. + CHK_Z(jungle::DB::close(db)); + + // Open copied one, to mimic a crash in the middle of flush. + CHK_Z(jungle::DB::open(&db, filename_copy, config)); + + // Flush again. + CHK_Z( db->flushLogs(jungle::FlushOptions()) ); + + // Check if all records are there. + for (size_t ii=0; iiget(jungle::SizedBuf(key_str), value_out) ); + CHK_EQ(val_str, value_out.toString()); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int duplicate_seq_flush_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.numTableWriters = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + const size_t NUM1 = 100; + const size_t NUM2 = 200; + + // Write something. + for (size_t ii=0; iisetRecord(rec) ); + } + CHK_Z( db->sync(false) ); + + // Write more. + for (size_t ii=NUM1; iisetRecord(rec) ); + } + CHK_Z( db->sync(false) ); + + // Close & reopen. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Write duplicate sequence number with different key. + for (size_t ii=NUM1; iisetRecord(rec) ); + } + CHK_Z( db->sync(false) ); + + // Flush again. 
+ CHK_Z( db->flushLogs(jungle::FlushOptions()) ); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + + +} using namespace corruption_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("log file truncation test", + log_file_truncation_test, + TestRange({100, 127, 60})); + + ts.doTest("log manifest corruption test", + log_manifest_corruption_test); + + ts.doTest("log manifest corruption across multi log files test", + log_manifest_corruption_across_file_test); + + ts.doTest("incomplete log test", + incomplete_log_test); + + ts.doTest("irrelevant log file test", + irrelevant_log_file_test); + + ts.doTest("not existing log file test", + not_existing_log_file_test); + + ts.doTest("wrong manifest test", + wrong_manifest_test, + TestRange({true, false})); + + ts.doTest("removed log files at the beginning test", + removed_log_files_at_the_beginning_test); + + ts.doTest("removed log files in the middle test", + removed_log_files_in_the_middle_test); + + ts.doTest("incomplete table set test", + incomplete_table_set_test); + + ts.doTest("duplicate seq number test", + duplicate_seq_flush_test); + + return 0; +} diff --git a/tests/jungle/custom_cmp_test.cc b/tests/jungle/custom_cmp_test.cc new file mode 100644 index 0000000..1b87ffa --- /dev/null +++ b/tests/jungle/custom_cmp_test.cc @@ -0,0 +1,410 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "jungle_test_common.h" + +#include "libjungle/jungle.h" + +#include + +#include + +int cmp_double(void* a, size_t len_a, + void* b, size_t len_b, + void* param) +{ + CHK_NONNULL(param); + + double aa = *(double*)a; + double bb = *(double*)b; + if (aa < bb) return -1; + else if (aa > bb) return 1; + return 0; +} + +static int _custom_cmp_set(jungle::DB* db, + std::vector& kv) { + int n = kv.size(); + for (int ii=0; iiset(kv_ref)); + } + return 0; +} + +static int _custom_cmp_get(jungle::DB* db, + std::vector& kv) { + int n = kv.size(); + for (int ii=0; iiget(key_buf, value_out)); + + double val_out = *(double*)value_out.data; + CHK_EQ(val, val_out); + value_out.free(); + } + return 0; +} + +int cmp_log_only() { + jungle::DB* db; + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + // Open DB with custom cmp (double). + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.cmpFunc = cmp_double; + config.cmpFuncParam = (void*)&db; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + _custom_cmp_set(db, kv); + + // Just sync. + CHK_Z(db->sync()); + + // Point query check. + _custom_cmp_get(db, kv); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TestSuite::clearTestFile(prefix); + return 0; +} + +int cmp_table_only() { + jungle::DB* db; + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + // Open DB with custom cmp (double). 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.cmpFunc = cmp_double; + config.cmpFuncParam = (void*)&db; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + _custom_cmp_set(db, kv); + + // Sync and flush. + CHK_Z(db->sync()); + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options)); + + // Point query check. + _custom_cmp_get(db, kv); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TestSuite::clearTestFile(prefix); + return 0; +} + +int cmp_mixed() { + jungle::DB* db; + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + // Open DB with custom cmp (double). + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.cmpFunc = cmp_double; + config.cmpFuncParam = (void*)&db; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 10; + + // Even numbers: in table. + std::vector kv(n); + for (int ii=0; iiset(kv_ref)); + } + CHK_Z(db->sync()); + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options)); + + // Odd numbers: in log. + for (int ii=1; iiset(kv_ref)); + } + CHK_Z(db->sync()); + + // Point query check. + _custom_cmp_get(db, kv); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. 
+ jungle::shutdown(); + _free_kv_pairs(n, kv); + + TestSuite::clearTestFile(prefix); + return 0; +} + +static int _custom_cmp_itr(jungle::Iterator& itr, + std::vector& kv) { + int n = kv.size(); + int count = 0; + do { + jungle::Record rec_out; + jungle::Status s = itr.get(rec_out); + if (!s) break; + + double key = (double)count * 100 / 13; + double key_out = *(double*)rec_out.kv.key.data; + CHK_EQ(key, key_out); + + double val = (double)count * 100 / 17; + double val_out = *(double*)rec_out.kv.value.data; + CHK_EQ(val, val_out); + + rec_out.free(); + count++; + } while (itr.next()); + CHK_EQ(n, count); + + return 0; +} + +int cmp_itr_log_only() { + jungle::DB* db; + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + // Open DB with custom cmp (double). + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.cmpFunc = cmp_double; + config.cmpFuncParam = (void*)&db; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + _custom_cmp_set(db, kv); + + // Just sync. + CHK_Z(db->sync()); + + // Key iterator check. + jungle::Iterator itr; + CHK_Z(itr.init(db)); + CHK_Z(_custom_cmp_itr(itr, kv)); + CHK_Z(itr.close()); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TestSuite::clearTestFile(prefix); + return 0; +} + +int cmp_itr_table_only() { + jungle::DB* db; + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + // Open DB with custom cmp (double). + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.cmpFunc = cmp_double; + config.cmpFuncParam = (void*)&db; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. 
+ int n = 5; + std::vector kv(n); + _custom_cmp_set(db, kv); + + // Sync and flush. + CHK_Z(db->sync()); + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options)); + + // Key iterator check. + jungle::Iterator itr; + CHK_Z(itr.init(db)); + CHK_Z(_custom_cmp_itr(itr, kv)); + CHK_Z(itr.close()); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TestSuite::clearTestFile(prefix); + return 0; +} + +int cmp_itr_mixed() { + jungle::DB* db; + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + // Open DB with custom cmp (double). + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.cmpFunc = cmp_double; + config.cmpFuncParam = (void*)&db; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 10; + + // Even numbers: in table. + std::vector kv(n); + for (int ii=0; iiset(kv_ref)); + } + CHK_Z(db->sync()); + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options)); + + // Odd numbers: in log. + for (int ii=1; iiset(kv_ref)); + } + CHK_Z(db->sync()); + + // Key iterator check. + jungle::Iterator itr; + CHK_Z(itr.init(db)); + CHK_Z(_custom_cmp_itr(itr, kv)); + CHK_Z(itr.close()); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. 
+ jungle::shutdown(); + _free_kv_pairs(n, kv); + + TestSuite::clearTestFile(prefix); + return 0; +} + +/* +int basic_test_template() { + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + jungle::shutdown(); + TestSuite::clearTestFile(prefix); + return 0; +} +*/ + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("custom cmp log only", cmp_log_only); + ts.doTest("custom cmp table only", cmp_table_only); + ts.doTest("custom cmp mixed", cmp_mixed); + ts.doTest("custom cmp iteration log only", cmp_itr_log_only); + ts.doTest("custom cmp iteration table only", cmp_itr_table_only); + ts.doTest("custom cmp iteration mixed", cmp_itr_mixed); + + return 0; +} diff --git a/tests/jungle/jungle_test_common.h b/tests/jungle/jungle_test_common.h new file mode 100644 index 0000000..4be6a03 --- /dev/null +++ b/tests/jungle/jungle_test_common.h @@ -0,0 +1,360 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#pragma once + +#include "config_test_common.h" +#include "event_awaiter.h" +#include "test_common.h" + +#include + +#define _INT_UNUSED_ int __attribute__((unused)) + +#define MAX_TEST_LEN (1024) + +static _INT_UNUSED_ +_init_kv_pairs(int n, + std::vector& kv, + std::string key_string, + std::string value_string) { + for (int i=0; i& kv) { + for (int i=0; i& kv) { + for (int i=from; isetSN(i + seq_offset + 1, kv[i]); + CHK_OK(s); + } + return 0; +} + +// e.g.) from = 5, to = 10 +// set(kv[5]) +// set(kv[6]) +// ... +// set(kv[9]) +static _INT_UNUSED_ +_set_bykey_kv_pairs(int from, + int to, + jungle::DB* db, + std::vector& kv) { + for (int i=from; iset(kv[i]); + CHK_OK(s); + } + return 0; +} + +// e.g.) from = 0, to = 10, seq_offset = 5 +// getSN(6, kv[0]) +// getSN(7, kv[1]) +// ... +// getSN(15, kv[9]) +static _INT_UNUSED_ +_get_byseq_check(int from, + int to, + int seq_offset, + jungle::DB* db, + std::vector& kv, + bool exist = true) { + for (int i=from; igetSN(i + seq_offset + 1, kv_ret); + if (!exist) { + CHK_NOT(s); + } else { + CHK_Z(s); + CHK_EQ(kv[i].key, kv_ret.key); + CHK_EQ(kv[i].value, kv_ret.value); + kv_ret.free(); + } + } + return 0; +} + +// e.g.) from = 5, to = 10 +// get(kv[5]) +// get(kv[6]) +// ... +// get(kv[9]) +static _INT_UNUSED_ +_get_bykey_check(int from, + int to, + jungle::DB* db, + std::vector& kv, + bool exist = true) { + for (int i=from; iget(kv[i].key, value); + if (!exist) { + CHK_NOT(s); + } else { + CHK_Z(s); + CHK_EQ(kv[i].value, value); + value.free(); + } + } + return 0; +} + +// e.g.) from = 5, to = 10 +// itr.get(kv[5]) +// itr.get(kv[6]) +// ... 
+// itr.get(kv[9]) +static _INT_UNUSED_ +_itr_check_step(int from, + int to, + int step, + jungle::Iterator& itr, + std::vector& kv) { + jungle::Record rec; + size_t idx = from; + size_t count = 0; + do { + if (idx >= (size_t)to) break; + + CHK_OK(itr.get(rec)); + CHK_EQ(kv[idx].key, rec.kv.key); + CHK_EQ(kv[idx].value, rec.kv.value); + rec.free(); + + idx += step; + count++; + } while (itr.next()); + CHK_EQ( (to - from + (step - 1)) / step, (int)count); + + return 0; +} + +static _INT_UNUSED_ +_itr_check(int from, + int to, + jungle::Iterator& itr, + std::vector& kv) { + return _itr_check_step(from, to, 1, itr, kv); +} + +// e.g.) from = 5, to = 10 +// itr.get(kv[9]) +// itr.get(kv[8]) +// ... +// itr.get(kv[5]) +static _INT_UNUSED_ +_itr_check_bwd_step(int from, + int to, + int step, + jungle::Iterator& itr, + std::vector& kv) { + jungle::Record rec; + int idx = to - 1; + size_t count = 0; + do { + if (idx < from) break; + + CHK_OK(itr.get(rec)); + CHK_EQ(kv[idx].key, rec.kv.key); + CHK_EQ(kv[idx].value, rec.kv.value); + rec.free(); + + idx -= step; + count++; + } while (itr.prev()); + CHK_EQ( (to - from + (step - 1)) / step, (int)count); + + return 0; +} + +static _INT_UNUSED_ +_itr_check_bwd(int from, + int to, + jungle::Iterator& itr, + std::vector& kv) { + return _itr_check_bwd_step(from, to, 1, itr, kv); +} + + +template +static _INT_UNUSED_ +_cmp_lists(std::list& a, std::list& b) { + CHK_EQ(a.size(), b.size()); + for (auto& e_a: a) { + bool found = false; + for (auto& e_b: b) { + if (e_a == e_b) { + found = true; + break; + } + } + CHK_OK(found); + } + return 0; +} + +int _set_keys(jungle::DB* db, + size_t start, + size_t end, + size_t step, + const std::string& key_fmt, + const std::string& val_fmt) +{ + char key_str[MAX_TEST_LEN]; + char val_str[MAX_TEST_LEN]; + for (size_t ii=start; iiset(jungle::KV(key, val))); + } + return 0; +} + +int _del_keys(jungle::DB* db, + size_t start, + size_t end, + size_t step, + const std::string& key_fmt) +{ + char 
key_str[MAX_TEST_LEN]; + for (size_t ii=start; iidel(jungle::SizedBuf(key))); + } + return 0; +} + +int _get_keys(jungle::DB* db, + size_t start, + size_t end, + size_t step, + const std::string& key_fmt, + const std::string& val_fmt) +{ + char key_str[MAX_TEST_LEN]; + char val_str[MAX_TEST_LEN]; + for (size_t ii=start; iiget(key, val_out)); + + if (!val_fmt.empty()) { + sprintf(val_str, val_fmt.c_str(), ii); + jungle::SizedBuf val(val_str); + CHK_EQ(val, val_out); + } + val_out.free(); + } + return 0; +} + +int _non_existing_keys(jungle::DB* db, + size_t start, + size_t end, + size_t step, + const std::string& key_fmt) +{ + char key_str[MAX_TEST_LEN]; + for (size_t ii=start; iiget(key, val_out); + CHK_FALSE(s); + } + return 0; +} + +int _iterate_keys(jungle::DB* db, + size_t start, + size_t end, + size_t step, + const std::string& key_fmt, + const std::string& val_fmt) +{ + // NOTE: `end` is inclusive: [start, end] + + jungle::Status s; + jungle::Iterator itr; + + char s_str[MAX_TEST_LEN]; + char e_str[MAX_TEST_LEN]; + sprintf(s_str, key_fmt.c_str(), start); + sprintf(e_str, key_fmt.c_str(), end); + jungle::SizedBuf s_key(s_str); + jungle::SizedBuf e_key(e_str); + CHK_Z(itr.init(db, s_key, e_key)); + + char key_str[MAX_TEST_LEN]; + char val_str[MAX_TEST_LEN]; + size_t count = 0; + size_t idx = start; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + sprintf(key_str, key_fmt.c_str(), idx); + jungle::SizedBuf key(key_str); + + CHK_EQ(key, rec_out.kv.key); + + if (!val_fmt.empty()) { + sprintf(val_str, val_fmt.c_str(), idx); + jungle::SizedBuf val(val_str); + CHK_EQ(val, rec_out.kv.value); + } + rec_out.free(); + + count++; + idx += step; + } while (itr.next().ok()); + CHK_EQ( ((end - start) / step) + 1, count ); + itr.close(); + + return 0; +} + diff --git a/tests/jungle/key_itr_test.cc b/tests/jungle/key_itr_test.cc new file mode 100644 index 0000000..5a13d87 --- /dev/null +++ b/tests/jungle/key_itr_test.cc @@ -0,0 +1,905 @@ 
+/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "jungle_test_common.h" + +#include "libjungle/jungle.h" + +#include + +#include + +int itr_key_empty() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 10; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Iterator on empty DB, should succeed. + jungle::Iterator itr; + s = itr.init(db); + CHK_Z(s); + + // All get, prev, next should fail. + jungle::Record rec_out; + CHK_NOT(itr.get(rec_out)); + CHK_NOT(itr.prev()); + CHK_NOT(itr.next()); + + // Close iterator. + itr.close(); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_key_basic() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 10; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. 
+ int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Update even number KV pairs. + int seq_count = n; + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key_", "value2_")); + for (int ii=0; iisetSN(seq_count, kv2[ii])); + seq_count++; + } + + // Iterator. + jungle::Iterator itr; + s = itr.init(db); + CHK_Z(s); + + // Check returned records. + int count = 0; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + if (count % 2 == 0) { + CHK_EQ(kv2[count].key, rec_out.kv.key); + CHK_EQ(kv2[count].value, rec_out.kv.value); + } else { + CHK_EQ(kv[count].key, rec_out.kv.key); + CHK_EQ(kv[count].value, rec_out.kv.value); + } + + rec_out.free(); + count++; + } while (itr.next()); + CHK_EQ(n, count); + + // Backward + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + count--; + if (count % 2 == 0) { + CHK_EQ(kv2[count].key, rec_out.kv.key); + CHK_EQ(kv2[count].value, rec_out.kv.value); + } else { + CHK_EQ(kv[count].key, rec_out.kv.key); + CHK_EQ(kv[count].value, rec_out.kv.value); + } + + rec_out.free(); + } while (itr.prev()); + CHK_EQ(0, count); + + // Close iterator. + itr.close(); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_key_purge() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 10; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync and flush. 
+ CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Update even number KV pairs. + int seq_count = n; + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key", "value2_")); + for (int ii=0; iisetSN(seq_count, kv2[ii])); + seq_count++; + } + CHK_Z(db->sync(false)); + + // Iterator. + jungle::Iterator itr; + s = itr.init(db); + CHK_Z(s); + + // Check returned records. + int count = 0; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + if (count % 2 == 0) { + CHK_EQ(kv2[count].key, rec_out.kv.key); + CHK_EQ(kv2[count].value, rec_out.kv.value); + } else { + CHK_EQ(kv[count].key, rec_out.kv.key); + CHK_EQ(kv[count].value, rec_out.kv.value); + } + + rec_out.free(); + count++; + } while (itr.next()); + CHK_EQ(n, count); + + // Backward + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + count--; + if (count % 2 == 0) { + CHK_EQ(kv2[count].key, rec_out.kv.key); + CHK_EQ(kv2[count].value, rec_out.kv.value); + } else { + CHK_EQ(kv[count].key, rec_out.kv.key); + CHK_EQ(kv[count].value, rec_out.kv.value); + } + + rec_out.free(); + } while (itr.prev()); + CHK_EQ(0, count); + + // Close iterator. + itr.close(); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_key_isolation() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + + // Set even number KV pairs. + for (int ii=0; ii<10; ii+=2) { + db->set(kv[ii]); + } + + // Iterator. + jungle::Iterator itr; + s = itr.init(db); + CHK_Z(s); + + // Set odd number KV pairs. 
+ for (int ii=1; ii<10; ii+=2) { + db->set(kv[ii]); + } + + // Only even numbers should be visible. + int count = 0; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + CHK_EQ(kv[count].key, rec_out.kv.key); + CHK_EQ(kv[count].value, rec_out.kv.value); + + rec_out.free(); + count += 2; + } while (itr.next()); + CHK_EQ(n, count); + + // But visiable by normal get. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Close iterator. + itr.close(); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_key_seek() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 5; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 50; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync & partial flush. + CHK_Z(db->sync(false)); + + uint64_t flush_upto = 13; + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options, flush_upto)); + + // By-key check. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Iterator. + jungle::Iterator itr; + CHK_Z(itr.init(db)); + + // Forward + for (int ii=0; iiflushLogs(f_options)); + + // Check again. + CHK_Z(itr.init(db)); + // Forward + for (int ii=0; ii kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + for (int ii=0; iiset(kv[ii])); + } + + // Delete even numbers. + for (int ii=0; iidel(kv[ii].key)); + } + + // Sync & partial flush. + CHK_Z(db->sync(false)); + + // By-key check. + for (int ii=0; iiget(kv[ii].key, value_out); + if (ii % 2 == 0) { + CHK_NOT(s); + } else { + CHK_EQ(kv[ii].value, value_out); + } + value_out.free(); + } + + // Iterator. 
+ jungle::Iterator itr; + CHK_Z(itr.init(db)); + + for (int ii=0; ii kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + for (int ii=0; iidel(kv[ii].key)); + } + CHK_Z(db->sync(false)); + + // By-key check. + for (int ii=0; iiget(kv[ii].key, value_out); + CHK_NOT(s); + value_out.free(); + } + + // Iterator. + jungle::Iterator itr; + jungle::Record rec; + CHK_Z(itr.init(db)); + CHK_NOT(itr.get(rec)); + + CHK_Z(itr.gotoBegin()); + CHK_NOT(itr.get(rec)); + CHK_Z(itr.gotoEnd()); + CHK_NOT(itr.get(rec)); + + CHK_Z(itr.close()); + + // Insert a single KV. + CHK_Z(db->set(kv[n/2])); + + // Now iterator should return that key only. + CHK_Z(itr.init(db)); + CHK_Z(itr.get(rec)); + CHK_EQ(kv[n/2].key, rec.kv.key); + rec.free(); + CHK_NOT(itr.next()); + + CHK_Z(itr.gotoBegin()); + CHK_Z(itr.get(rec)); + CHK_EQ(kv[n/2].key, rec.kv.key); + rec.free(); + CHK_NOT(itr.next()); + + CHK_Z(itr.gotoEnd()); + CHK_Z(itr.get(rec)); + CHK_EQ(kv[n/2].key, rec.kv.key); + rec.free(); + CHK_NOT(itr.prev()); + + CHK_Z(itr.close()); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_key_insert_delete_all() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 5; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 50; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + for (int ii=0; iiset(kv[ii])); + } + for (int ii=0; iidel(kv[ii].key)); + } + CHK_Z(db->sync(false)); + + // By-key check. + for (int ii=0; iiget(kv[ii].key, value_out); + CHK_NOT(s); + value_out.free(); + } + + // Iterator. 
+ jungle::Iterator itr; + jungle::Record rec; + CHK_Z(itr.init(db)); + CHK_NOT(itr.get(rec)); + + CHK_Z(itr.gotoBegin()); + CHK_NOT(itr.get(rec)); + CHK_Z(itr.gotoEnd()); + CHK_NOT(itr.get(rec)); + + CHK_Z(itr.close()); + + // Insert a single KV. + CHK_Z(db->set(kv[n/2])); + + // Now iterator should return that key only. + CHK_Z(itr.init(db)); + CHK_Z(itr.get(rec)); + CHK_EQ(kv[n/2].key, rec.kv.key); + rec.free(); + CHK_NOT(itr.next()); + + CHK_Z(itr.gotoBegin()); + CHK_Z(itr.get(rec)); + CHK_EQ(kv[n/2].key, rec.kv.key); + rec.free(); + CHK_NOT(itr.next()); + + CHK_Z(itr.gotoEnd()); + CHK_Z(itr.get(rec)); + CHK_EQ(kv[n/2].key, rec.kv.key); + rec.free(); + CHK_NOT(itr.prev()); + + CHK_Z(itr.close()); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_key_flush_and_delete_all(bool recreate = false) { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Set KV pairs. + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + + if (!recreate) { + // Open DB once. + TestSuite::clearTestFile(filename); + jungle::DBConfig config; + config.maxEntriesInLogFile = 5; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + } + + for (int kk=0; kk<2*n; kk++) { + if (recreate) { + // Open DB in every run. + TestSuite::clearTestFile(filename); + jungle::DBConfig config; + config.maxEntriesInLogFile = 5; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + } + + // kk: decide when to flush. 
+ int cnt = 0; + + // set all + for (int ii=0; iiset(kv[ii])); + if (cnt++ == kk) { + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + } + } + + // delete all + for (int ii=0; iidel(kv[ii].key)); + if (cnt++ == kk) { + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + } + } + + CHK_Z(db->sync(false)); + + // By-key check: nothing should be visible. + for (int ii=0; iiget(kv[ii].key, value_out); + CHK_NOT(s); + value_out.free(); + } + + // Iterator. + jungle::Iterator itr; + jungle::Record rec; + CHK_Z(itr.init(db)); + CHK_NOT(itr.get(rec)); + + CHK_Z(itr.gotoBegin()); + CHK_NOT(itr.get(rec)); + CHK_Z(itr.gotoEnd()); + CHK_NOT(itr.get(rec)); + + CHK_Z(itr.close()); + + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + if (recreate) { + // Close DB in every run. + s = jungle::DB::close(db); + CHK_Z(s); + } + } + + if (!recreate) { + // Close DB once. + s = jungle::DB::close(db); + CHK_Z(s); + } + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_key_flush_and_delete_half_even(bool recreate = false) { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Set KV pairs. + int n = 10; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + + if (!recreate) { + // Open DB once. + TestSuite::clearTestFile(filename); + jungle::DBConfig config; + config.maxEntriesInLogFile = 5; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + } + + for (int kk=0; kk<15*n/10; kk++) { + if (recreate) { + // Open DB in every run. + TestSuite::clearTestFile(filename); + jungle::DBConfig config; + config.maxEntriesInLogFile = 5; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + } + + // kk: decide when to flush. 
+ int cnt = 0; + + // set all + for (int ii=0; iiset(kv[ii])); + if (cnt++ == kk) { + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + } + } + + // delete even numbers + for (int ii=0; iidel(kv[ii].key)); + if (cnt++ == kk) { + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + } + } + + CHK_Z(db->sync(false)); + + // By-key check: only odd numbers should be visible. + for (int ii=0; iiget(kv[ii].key, value_out); + if (ii % 2 == 0) { + // even + CHK_NOT(s); + value_out.free(); + } else { + // odd + CHK_Z(s); + CHK_EQ(kv[ii].value, value_out); + } + } + + // Iterator. + jungle::Iterator itr; + CHK_Z(itr.init(db)); + + for (int ii=0; iiflushLogs(jungle::FlushOptions())); + + if (recreate) { + // Close DB in every run. + s = jungle::DB::close(db); + CHK_Z(s); + } + } + + if (!recreate) { + // Close DB once. + s = jungle::DB::close(db); + CHK_Z(s); + } + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + //ts.options.abortOnFailure = true; + ts.doTest("key empty itr", itr_key_empty); + ts.doTest("key itr test", itr_key_basic); + ts.doTest("key itr purge test", itr_key_purge); + ts.doTest("key itr isolation test", itr_key_isolation); + ts.doTest("key itr seek test", itr_key_seek); + ts.doTest("key itr deleted test", itr_key_deleted); + ts.doTest("key itr all deletion markers test", itr_key_all_del_markers); + ts.doTest("key itr insert and then delete all test", itr_key_insert_delete_all); + ts.doTest("key itr flush and delete all test", + itr_key_flush_and_delete_all, + TestRange({false, true})); + ts.doTest("key itr flush and delete half even test", + itr_key_flush_and_delete_half_even, + TestRange({false, true})); + + return 0; +} diff --git a/tests/jungle/large_test.cc b/tests/jungle/large_test.cc new file mode 100644 index 
0000000..68b0c09 --- /dev/null +++ b/tests/jungle/large_test.cc @@ -0,0 +1,98 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "config_test_common.h" +#include "test_common.h" + +#include "libjungle/jungle.h" + +#include + +#include + +#define DURATION_MS 1000 + +int log_file_purge_test() { + jungle::DB* db; + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 1024; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + uint64_t idx = 0; + TestSuite::Timer timer(DURATION_MS); + do { + jungle::KV kv; + std::string key_str = "k" + TestSuite::lzStr(7, idx); + std::string val_str = "v" + TestSuite::lzStr(7, idx); + kv.alloc(key_str, val_str); + db->setSN(idx, kv); + kv.free(); + + if (idx > 1000 && idx % 1000 == 0) { + db->sync(false); + + jungle::FlushOptions f_opt; + f_opt.purgeOnly = true; + db->flushLogs(f_opt, idx - 1000); + } + idx++; + } while (!timer.timeover() && idx < 1000000); + + // Close DB. + s = jungle::DB::close(db); + CHK_OK(s); + + // Free all resources for jungle. 
+ jungle::shutdown(); + + TestSuite::clearTestFile(prefix); + return 0; +} + +/* +int basic_test_template() { + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + jungle::shutdown(); + TestSuite::clearTestFile(prefix); + return 0; +} +*/ + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("log file purge test", log_file_purge_test); + + return 0; +} diff --git a/tests/jungle/level_extension_test.cc b/tests/jungle/level_extension_test.cc new file mode 100644 index 0000000..a305208 --- /dev/null +++ b/tests/jungle/level_extension_test.cc @@ -0,0 +1,1077 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "jungle_test_common.h" + +#include "internal_helper.h" + +#include +#include +#include +#include + +#include + +namespace level_extension_test { + +int next_level_basic_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = true; + config.maxL1TableSize = 1024 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 10000; + const char V_FMT[] = "v%0100zu"; + + // Write even numbers. + CHK_Z(_set_keys(db, 0, NUM, 2, "k%06zu", V_FMT)); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Write odd numbers. + CHK_Z(_set_keys(db, 1, NUM, 2, "k%06zu", V_FMT)); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Point query check. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + + // Range query check. + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Compact more. + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Check again. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Close and re-open. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check again. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Now L1 compaction (split). + CHK_Z( db->splitLevel(c_opt, 1) ); + + // Check again. 
+ CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +struct WorkerArgs : TestSuite::ThreadArgs { + enum CompactionType { + INTERLEVEL = 0x0, + INPLACE = 0x1, + SPLIT = 0x2, + MERGE = 0x3, + L0 = 0x4, + }; + + WorkerArgs() + : db(nullptr) + , type(INTERLEVEL) + , hashNum(0) + {} + jungle::DB* db; + CompactionType type; + size_t hashNum; + jungle::Status expResult; +}; + +int compaction_worker(TestSuite::ThreadArgs* t_args) { + WorkerArgs* args = static_cast(t_args); + jungle::CompactOptions c_opt; + jungle::Status s; + switch (args->type) { + case WorkerArgs::INTERLEVEL: + s = args->db->compactLevel(c_opt, 1); + break; + case WorkerArgs::INPLACE: + s = args->db->compactInplace(c_opt, 1); + break; + case WorkerArgs::SPLIT: + s = args->db->splitLevel(c_opt, 1); + break; + case WorkerArgs::MERGE: + s = args->db->mergeLevel(c_opt, 1); + break; + case WorkerArgs::L0: + s = args->db->compactL0(c_opt, args->hashNum); + break; + default: break; + } + + CHK_EQ( args->expResult, s ); + return 0; +} + +int interlevel_compaction_cancel_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Disable background threads. + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 0; + g_config.numCompactorThreads = 0; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = true; + config.maxL1TableSize = 1024 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 10000; + const char V_FMT[] = "v%0100zu"; + + // Write KVs. + CHK_Z(_set_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + + // Sync & flush. 
+ CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Set delay for split. + jungle::DebugParams d_params; + // 150 ms per record. + d_params.compactionDelayUs = 150*1000; + jungle::setDebugParams(d_params); + + // Inter-level compaction. + WorkerArgs w_args; + w_args.db = db; + w_args.type = WorkerArgs::INTERLEVEL; + w_args.expResult = jungle::Status::COMPACTION_CANCELLED; + TestSuite::ThreadHolder h(&w_args, compaction_worker, nullptr); + TestSuite::sleep_sec(1, "wait for worker to start"); + + // Close DB, it should cancel the split. + CHK_Z(jungle::DB::close(db)); + h.join(); + CHK_Z(h.getResult()); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Again, set delay to 2nd phase at this time. + d_params.compactionDelayUs = 0; + d_params.compactionItrScanDelayUs = 150*1000; + jungle::setDebugParams(d_params); + + w_args.db = db; + w_args.type = WorkerArgs::INTERLEVEL; + w_args.expResult = jungle::Status::COMPACTION_CANCELLED; + TestSuite::ThreadHolder h2(&w_args, compaction_worker, nullptr); + TestSuite::sleep_sec(1, "wait for worker to start"); + + // Close DB, it should cancel the split. + CHK_Z(jungle::DB::close(db)); + h2.join(); + CHK_Z(h2.getResult()); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Inter-level compaction should succeed even after cancel. + d_params.compactionItrScanDelayUs = 0; // Cancel delay. + jungle::setDebugParams(d_params); + CHK_Z( db->compactLevel(c_opt, 1) ); + + // Check again. 
+ CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int inplace_compaction_cancel_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Disable background threads. + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 0; + g_config.numCompactorThreads = 0; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = true; + config.maxL1TableSize = 1024 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + config.minFileSizeToCompact = 128 * 1024; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 10000; + const char V_FMT[] = "v%0100zu"; + + // Write KVs. + CHK_Z(_set_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Write more KVs, flush, and compact to L1, + // in order to make stale data for in-place compaction. + CHK_Z(_set_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + for (size_t ii=0; iicompactL0(c_opt, ii)); + + // Set delay for split. + jungle::DebugParams d_params; + // 150 ms per record. + d_params.compactionDelayUs = 150*1000; + jungle::setDebugParams(d_params); + + // Inter-level compaction. + WorkerArgs w_args; + w_args.db = db; + w_args.type = WorkerArgs::INPLACE; + w_args.expResult = jungle::Status::COMPACTION_CANCELLED; + TestSuite::ThreadHolder h(&w_args, compaction_worker, nullptr); + TestSuite::sleep_sec(1, "wait for worker to start"); + + // Close DB, it should cancel the split. 
+ CHK_Z(jungle::DB::close(db)); + h.join(); + CHK_Z(h.getResult()); + + // NOTE: In-place compaction doesn't have 2nd phase iteration. + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // In-place compaction should succeed even after cancel. + d_params.compactionDelayUs = 0; // Cancel delay. + jungle::setDebugParams(d_params); + CHK_Z( db->compactInplace(c_opt, 1) ); + + // Check again. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int split_cancel_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Disable background threads. + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 0; + g_config.numCompactorThreads = 0; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = true; + config.maxL1TableSize = 1024 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 10000; + const char V_FMT[] = "v%0100zu"; + + // Write KVs. + CHK_Z(_set_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Set delay for split. + jungle::DebugParams d_params; + // 150 ms per record. + d_params.compactionDelayUs = 150*1000; + jungle::setDebugParams(d_params); + + // Interlevel compaction. 
+ WorkerArgs w_args; + w_args.db = db; + w_args.type = WorkerArgs::INTERLEVEL; + w_args.expResult = jungle::Status::COMPACTION_CANCELLED; + TestSuite::ThreadHolder h(&w_args, compaction_worker, nullptr); + TestSuite::sleep_sec(1, "wait for worker to start"); + + // Close DB, it should cancel the interlevel compaction. + CHK_Z(jungle::DB::close(db)); + h.join(); + CHK_Z(h.getResult()); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Again, set delay to 2nd phase at this time. + d_params.compactionDelayUs = 0; + d_params.compactionItrScanDelayUs = 150*1000; + jungle::setDebugParams(d_params); + + w_args.db = db; + w_args.type = WorkerArgs::SPLIT; + w_args.expResult = jungle::Status::COMPACTION_CANCELLED; + TestSuite::ThreadHolder h2(&w_args, compaction_worker, nullptr); + TestSuite::sleep_sec(1, "wait for worker to start"); + + // Close DB, it should cancel the split. + CHK_Z(jungle::DB::close(db)); + h2.join(); + CHK_Z(h2.getResult()); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Split should succeed even after cancel. + d_params.compactionItrScanDelayUs = 0; // Cancel delay. + jungle::setDebugParams(d_params); + CHK_Z( db->splitLevel(c_opt, 1) ); + + // Check again. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int merge_cancel_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Disable background threads. + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 0; + g_config.numCompactorThreads = 0; + jungle::init(g_config); + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = true; + config.minFileSizeToCompact = 64 * 1024; + config.maxL0TableSize = 64 * 1024; + config.maxL1TableSize = 64 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 10000; + const char V_FMT[] = "v%0250zu"; + + // Write KVs. + CHK_Z(_set_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Delete KVs. + CHK_Z(_del_keys(db, 0, NUM/2, 1, "k%06zu")); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // L1 in-place compaction. + for (size_t ii=0; ii<4; ++ii) { + db->compactInplace(c_opt, 1); + } + + // Check. + CHK_Z(_non_existing_keys(db, 0, NUM/2, 1, "k%06zu")); + CHK_Z(_get_keys(db, NUM/2, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, NUM/2, NUM-1, 1, "k%06zu", V_FMT)); + + // Set delay for merge. + jungle::DebugParams d_params; + d_params.compactionItrScanDelayUs = 500*1000; + jungle::setDebugParams(d_params); + + // Merge. + WorkerArgs w_args; + w_args.db = db; + w_args.type = WorkerArgs::MERGE; + w_args.expResult = jungle::Status::COMPACTION_CANCELLED; + TestSuite::ThreadHolder h(&w_args, compaction_worker, nullptr); + TestSuite::sleep_ms(200, "wait for worker to start"); + + // Close DB, it should cancel the merge. + CHK_Z(jungle::DB::close(db)); + h.join(); + CHK_Z(h.getResult()); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check. + CHK_Z(_non_existing_keys(db, 0, NUM/2, 1, "k%06zu")); + CHK_Z(_get_keys(db, NUM/2, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, NUM/2, NUM-1, 1, "k%06zu", V_FMT)); + + // Merge should succeed even after cancel. 
+ d_params.compactionItrScanDelayUs = 0; // Cancel delay. + jungle::setDebugParams(d_params); + CHK_Z( db->mergeLevel(c_opt, 1) ); + + // Check again. + CHK_Z(_non_existing_keys(db, 0, NUM/2, 1, "k%06zu")); + CHK_Z(_get_keys(db, NUM/2, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, NUM/2, NUM-1, 1, "k%06zu", V_FMT)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int single_file_compaction_cancel_mode_change_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Disable background threads. + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 0; + g_config.numCompactorThreads = 0; + jungle::init(g_config); + + // Open DB with non-level-extension mode. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + config.maxL1TableSize = 1024 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 10000; + const char V_FMT[] = "v%0100zu"; + + // Write KVs. + CHK_Z(_set_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Set delay for split. + jungle::DebugParams d_params; + // 10 ms per record. + d_params.compactionDelayUs = 10*1000; + jungle::setDebugParams(d_params); + + // L0 compaction. + WorkerArgs w_args; + w_args.db = db; + w_args.type = WorkerArgs::L0; + w_args.expResult = jungle::Status::COMPACTION_CANCELLED; + TestSuite::ThreadHolder h(&w_args, compaction_worker, nullptr); + TestSuite::sleep_sec(1, "wait for worker to start"); + + // Close DB, it should cancel the split. + CHK_Z(jungle::DB::close(db)); + h.join(); + CHK_Z(h.getResult()); + + // Open DB with level-extension mode. + config.nextLevelExtension = true; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check. 
+ CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Remove delay. + d_params.compactionDelayUs = 0; + jungle::setDebugParams(d_params); + + // Inter-level compaction should succeed even after cancel. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii) ); + } + + // Check again. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + // Close & reopen. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check again. + CHK_Z(_get_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, NUM-1, 1, "k%06zu", V_FMT)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int dual_file_compaction_cancel_mode_change_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Disable background threads. + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 0; + g_config.numCompactorThreads = 0; + jungle::init(g_config); + + // Open DB with non-level-extension mode. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = false; + config.maxL1TableSize = 1024 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 5000; + const char V_FMT[] = "v%0100zu"; + + // Write KVs. + CHK_Z(_set_keys(db, 0, NUM, 1, "k%06zu", V_FMT)); + + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Do L0 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Write more KVs. + CHK_Z(_set_keys(db, NUM, 2*NUM, 1, "k%06zu", V_FMT)); + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Set delay for split. 
+ jungle::DebugParams d_params; + // 10 ms per record. + d_params.compactionDelayUs = 10*1000; + jungle::setDebugParams(d_params); + + // L0 compaction. + WorkerArgs w_args; + w_args.db = db; + w_args.type = WorkerArgs::L0; + w_args.expResult = jungle::Status::COMPACTION_CANCELLED; + TestSuite::ThreadHolder h(&w_args, compaction_worker, nullptr); + TestSuite::sleep_sec(1, "wait for worker to start"); + + // Close DB, it should cancel the split. + CHK_Z(jungle::DB::close(db)); + h.join(); + CHK_Z(h.getResult()); + + // Open DB with level-extension mode. + config.nextLevelExtension = true; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check. + CHK_Z(_get_keys(db, 0, 2*NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, 2*NUM-1, 1, "k%06zu", V_FMT)); + + // Remove delay. + d_params.compactionDelayUs = 0; + jungle::setDebugParams(d_params); + + // Inter-level compaction should succeed even after cancel. + for (size_t ii=0; iicompactL0(c_opt, ii) ); + } + + // Check again. + CHK_Z(_get_keys(db, 0, 2*NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, 2*NUM-1, 1, "k%06zu", V_FMT)); + + // Close & reopen. + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Check again. + CHK_Z(_get_keys(db, 0, 2*NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, 2*NUM-1, 1, "k%06zu", V_FMT)); + + // Compact L0. + for (size_t ii=0; iicompactL0(c_opt, ii) ); + } + + // Check again. + CHK_Z(_get_keys(db, 0, 2*NUM, 1, "k%06zu", V_FMT)); + CHK_Z(_iterate_keys(db, 0, 2*NUM-1, 1, "k%06zu", V_FMT)); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int seq_itr_test(bool with_opened_itr) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + // Disable background threads. 
+ jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.numCompactorThreads = 0; + jungle::init(g_config); + + // Open DB with non-level-extension mode. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = true; + config.maxL1TableSize = 1024 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 2000; + const char K_FMT[] = "k%06zu"; + const char V_FMT[] = "%08zu_v%0100zu"; + + std::vector key_arr(NUM); + std::iota(key_arr.begin(), key_arr.end(), 0); + + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(key_arr.begin(), key_arr.end(), g); + + // Write KVs. + size_t seqnum = 0; + char key_raw[256]; + char val_raw[256]; + for (size_t ii=0; iiset( jungle::KV(key_raw, val_raw) ); + if (ii % (NUM / 10) == 0) { + CHK_Z(db->sync(false)); + } + } + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Open an iterator and keep it to block table file removal. + jungle::Iterator initial_itr; + if (with_opened_itr) { + CHK_Z(initial_itr.initSN(db)); + } + + auto initial_itr_check = [&s, NUM, &initial_itr]() -> int { + size_t count = 0; + initial_itr.gotoBegin(); + do { + jungle::Record rec; + jungle::Record::Holder h(rec); + s = initial_itr.get(rec); + if (!s) break; + + count++; + std::string val_seq = rec.kv.value.toString().substr(0, 8); + uint64_t cur_seq = std::atol(val_seq.c_str()); + CHK_EQ(count, cur_seq); + } while (initial_itr.next().ok()); + CHK_EQ(NUM, count); + return 0; + }; + if (with_opened_itr) { + CHK_Z(initial_itr_check()); + } + + // L0 -> L1 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + if (with_opened_itr) { + CHK_Z(initial_itr_check()); + } + + // Write more KVs (even numbers). 
+ for (size_t ii=0; iiset( jungle::KV(key_raw, val_raw) ); + if (ii % (NUM / 10) == 0) { + CHK_Z(db->sync(false)); + } + } + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + if (with_opened_itr) { + CHK_Z(initial_itr_check()); + } + + { // Do seq iteration. + jungle::Iterator itr; + itr.initSN(db); + size_t count = 0; + do { + jungle::Record rec; + jungle::Record::Holder h(rec); + s = itr.get(rec); + if (!s) break; + + count++; + std::string val_seq = rec.kv.value.toString().substr(0, 8); + uint64_t cur_seq = std::atol(val_seq.c_str()); + CHK_EQ(count, cur_seq); + + } while (itr.next().ok()); + CHK_Z(itr.close()); + CHK_EQ( NUM * 3 / 2, count ); + } + + // Write more KVs (odd numbers). + for (size_t ii=1; iiset( jungle::KV(key_raw, val_raw) ); + } + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + if (with_opened_itr) { + CHK_Z(initial_itr_check()); + } + + auto seq_check = [&s, NUM, db]() -> int { + jungle::Iterator itr; + itr.initSN(db); + uint64_t prev_seq = 0; + size_t count = 0; + do { + jungle::Record rec; + jungle::Record::Holder h(rec); + s = itr.get(rec); + if (!s) break; + + count++; + std::string val_seq = rec.kv.value.toString().substr(0, 8); + uint64_t cur_seq = std::atol(val_seq.c_str()); + CHK_GT(cur_seq, prev_seq); + prev_seq = cur_seq; + } while (itr.next().ok()); + CHK_Z(itr.close()); + CHK_GTEQ(count, NUM); + return 0; + }; + + // Do L0 -> L1 compaction, and check in between. + for (size_t ii=0; iicompactL0(c_opt, ii)); + if (with_opened_itr) { + CHK_Z(initial_itr_check()); + } + } + // Final check. + CHK_Z(seq_check()); + + if (with_opened_itr) { + CHK_Z(initial_itr.close()); + } + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int snapshot_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db = nullptr; + + // Disable background threads. 
+ jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.numCompactorThreads = 0; + jungle::init(g_config); + + // Open DB with non-level-extension mode. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.nextLevelExtension = true; + config.maxL1TableSize = 1024 * 1024; + config.bloomFilterBitsPerUnit = 10.0; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t NUM = 2000; + const char K_FMT[] = "k%06zu"; + const char V_FMT[] = "%08zu_v%0100zu"; + + std::vector key_arr(NUM); + std::iota(key_arr.begin(), key_arr.end(), 0); + + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(key_arr.begin(), key_arr.end(), g); + + // Write KVs. + size_t seqnum = 0; + char key_raw[256]; + char val_raw[256]; + for (size_t ii=0; iiset( jungle::KV(key_raw, val_raw) ); + } + // Sync & flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // L0 -> L1 compaction. + jungle::CompactOptions c_opt; + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + + // Write more KVs (even numbers). + for (size_t ii=0; iiset( jungle::KV(key_raw, val_raw) ); + } + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Open a snapshot (latest one). + jungle::DB* snap = nullptr; + CHK_Z(db->openSnapshot(&snap)); + + auto snap_check = [&s, NUM, snap]() -> int { + jungle::Iterator itr; + itr.initSN(snap); + size_t count = 0; + do { + jungle::Record rec; + jungle::Record::Holder h(rec); + s = itr.get(rec); + if (!s) break; + + count++; + std::string val_seq = rec.kv.value.toString().substr(0, 8); + uint64_t cur_seq = std::atol(val_seq.c_str()); + CHK_EQ(count, cur_seq); + + } while (itr.next().ok()); + CHK_Z(itr.close()); + CHK_EQ( NUM * 3 / 2, count ); + return 0; + }; + CHK_Z(snap_check()); + + // Write more KVs (odd numbers). 
+ for (size_t ii=1; iiset( jungle::KV(key_raw, val_raw) ); + } + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + CHK_Z(snap_check()); + + // Do L0 -> L1 compaction, and check in between. + for (size_t ii=0; iicompactL0(c_opt, ii)); + } + // Final check. + CHK_Z(snap_check()); + + CHK_Z(jungle::DB::close(snap)); + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +} using namespace level_extension_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("next level basic test", + next_level_basic_test); + + ts.doTest("interlevel compaction cancel test", + interlevel_compaction_cancel_test); + + ts.doTest("inplace compaction cancel test", + inplace_compaction_cancel_test); + + ts.doTest("split cancel test", + split_cancel_test); + + ts.doTest("merge cancel test", + merge_cancel_test); + + ts.doTest("single file compaction cancel with mode change test", + single_file_compaction_cancel_mode_change_test); + + ts.doTest("dual file compaction cancel with mode change test", + dual_file_compaction_cancel_mode_change_test); + + ts.doTest("sequence iterator test", + seq_itr_test, + TestRange({false, true})); + + ts.doTest("snapshot test", + snapshot_test); + + return 0; +} + diff --git a/tests/jungle/log_reclaim_test.cc b/tests/jungle/log_reclaim_test.cc new file mode 100644 index 0000000..7c435a8 --- /dev/null +++ b/tests/jungle/log_reclaim_test.cc @@ -0,0 +1,966 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "jungle_test_common.h" + +#include "internal_helper.h" + +#include +#include + +#include + +namespace log_reclaim_test { + +int basic_log_reclaim_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num = 1000; + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + + // Sync. + CHK_Z(db->sync(false)); + + TestSuite::sleep_sec(3, "waiting for reclaiming"); + + // Purge. + CHK_Z( db->flushLogs(jungle::FlushOptions(), 500) ); + + // Get min seq number. + uint64_t min_seqnum = 0; + CHK_Z(db->getMinSeqNum(min_seqnum)); + CHK_EQ(501, min_seqnum); + + // Point query something. + for (size_t ii=501; ii<=1000; ii+=171) { + char key_str[256]; + char val_str[256]; + jungle::KV kv_out; + CHK_Z(db->getSN(ii, kv_out)); + + sprintf(key_str, "k%06zu", ii-1); + sprintf(val_str, "v%06zu", ii-1); + jungle::SizedBuf key(key_str); + jungle::SizedBuf val(val_str); + CHK_EQ(key, kv_out.key); + CHK_EQ(val, kv_out.value); + kv_out.free(); + } + + // Seq iterator. 
+ jungle::Iterator itr; + CHK_Z( itr.initSN(db, 751, 900) ); + size_t idx = 750; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + char key_str[256]; + char val_str[256]; + sprintf(key_str, "k%06zu", idx); + sprintf(val_str, "v%06zu", idx); + jungle::SizedBuf key(key_str); + jungle::SizedBuf val(val_str); + CHK_EQ(key, rec_out.kv.key); + CHK_EQ(val, rec_out.kv.value); + rec_out.free(); + idx++; + } while (itr.next().ok()); + itr.close(); + + TestSuite::sleep_sec(3, "waiting for reclaiming"); + + // Purge more. + CHK_Z( db->flushLogs(jungle::FlushOptions(), 700) ); + + // Get min seq number. + min_seqnum = 0; + CHK_Z(db->getMinSeqNum(min_seqnum)); + CHK_EQ(701, min_seqnum); + + CHK_Z(jungle::DB::close(db)); + + // Offline checking if it is log section mode. + CHK_TRUE( jungle::DB::isLogSectionMode(filename) ); + + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int multi_instances_reclaim_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + std::vector dbs(10); + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + g_config.numFlusherThreads = 0; + g_config.numCompactorThreads = 0; + g_config.numTableWriters = 0; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 50; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + + size_t count = 0; + for (auto& entry: dbs) { + jungle::DB*& db = entry; + CHK_Z( jungle::DB::open( &db, + filename + "_" + std::to_string(count++), + config) ); + } + + size_t num = 1000; + for (auto& entry: dbs) { + jungle::DB*& db = entry; + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + } + + // Sync. + for (auto& entry: dbs) { + jungle::DB*& db = entry; + CHK_Z( db->sync(false) ); + } + + TestSuite::sleep_sec(3, "waiting for reclaiming"); + + // Purge. 
+ for (auto& entry: dbs) { + jungle::DB*& db = entry; + CHK_Z( db->flushLogs(jungle::FlushOptions(), 500) ); + } + + // Get min seq number. + for (auto& entry: dbs) { + jungle::DB*& db = entry; + uint64_t min_seqnum = 0; + CHK_Z(db->getMinSeqNum(min_seqnum)); + CHK_EQ(501, min_seqnum); + } + + // Point query something. + for (size_t ii=501; ii<=1000; ii+=171) { + for (size_t jj=0; jj<10; ++jj) { + jungle::DB*& db = dbs[jj]; + char key_str[256]; + char val_str[256]; + jungle::KV kv_out; + jungle::KV::Holder h_kv_out(kv_out); + if (ii == 843 && jj == 9) { + int bp = 0; (void)bp; + } + s = db->getSN(ii, kv_out); + if (!s) { + int bp = 0; (void)bp; + } + + sprintf(key_str, "k%06zu", ii-1); + sprintf(val_str, "v%06zu", ii-1); + jungle::SizedBuf key(key_str); + jungle::SizedBuf val(val_str); + CHK_EQ(key, kv_out.key); + CHK_EQ(val, kv_out.value); + } + } + + // Seq iterator. + for (auto& entry: dbs) { + jungle::DB*& db = entry; + jungle::Iterator itr; + CHK_Z( itr.initSN(db, 751, 900) ); + size_t idx = 750; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + char key_str[256]; + char val_str[256]; + sprintf(key_str, "k%06zu", idx); + sprintf(val_str, "v%06zu", idx); + jungle::SizedBuf key(key_str); + jungle::SizedBuf val(val_str); + CHK_EQ(key, rec_out.kv.key); + CHK_EQ(val, rec_out.kv.value); + rec_out.free(); + idx++; + } while (itr.next().ok()); + itr.close(); + } + + TestSuite::sleep_sec(3, "waiting for reclaiming"); + + // Purge more. + for (auto& entry: dbs) { + jungle::DB*& db = entry; + CHK_Z( db->flushLogs(jungle::FlushOptions(), 700) ); + } + + // Get min seq number. 
+ for (auto& entry: dbs) { + jungle::DB*& db = entry; + uint64_t min_seqnum = 0; + CHK_Z(db->getMinSeqNum(min_seqnum)); + CHK_EQ(701, min_seqnum); + } + + for (auto& entry: dbs) { + jungle::DB*& db = entry; + CHK_Z(jungle::DB::close(db)); + } + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int reload_log_in_reclaim_mode_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num = 1000; + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + + // Sync. + CHK_Z(db->sync(false)); + + // Purge. + CHK_Z( db->flushLogs(jungle::FlushOptions(), 500) ); + + // Close and re-open. + CHK_Z(jungle::DB::close(db)); + + // Offline checking if it is log section mode. + CHK_TRUE( jungle::DB::isLogSectionMode(filename) ); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get min seq number. + uint64_t min_seqnum = 0; + CHK_Z(db->getMinSeqNum(min_seqnum)); + CHK_EQ(501, min_seqnum); + + // Point query something. + for (size_t ii=501; ii<=1000; ii+=171) { + char key_str[256]; + char val_str[256]; + jungle::KV kv_out; + CHK_Z(db->getSN(ii, kv_out)); + + sprintf(key_str, "k%06zu", ii-1); + sprintf(val_str, "v%06zu", ii-1); + jungle::SizedBuf key(key_str); + jungle::SizedBuf val(val_str); + CHK_EQ(key, kv_out.key); + CHK_EQ(val, kv_out.value); + kv_out.free(); + } + + // Seq iterator. 
+ jungle::Iterator itr; + CHK_Z( itr.initSN(db, 751, 900) ); + size_t idx = 750; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + char key_str[256]; + char val_str[256]; + sprintf(key_str, "k%06zu", idx); + sprintf(val_str, "v%06zu", idx); + jungle::SizedBuf key(key_str); + jungle::SizedBuf val(val_str); + CHK_EQ(key, rec_out.kv.key); + CHK_EQ(val, rec_out.kv.value); + rec_out.free(); + idx++; + } while (itr.next().ok()); + itr.close(); + + TestSuite::sleep_sec(3, "waiting for reclaiming"); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int reload_with_empty_files_test_create() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10000; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num = 100; + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + + // Sync. + CHK_Z(db->sync(false)); + + // Close and re-open. + CHK_Z(jungle::DB::close(db)); + + // Offline checking if it is log section mode. + CHK_TRUE( jungle::DB::isLogSectionMode(filename) ); + + CHK_Z(jungle::DB::open(&db, filename, config)); + + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + + TestSuite::sleep_sec(60, "kill me now"); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int reload_with_empty_files_test_load() { + std::string filename = "./bed"; + + jungle::Status s; + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10000; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + CHK_Z(jungle::DB::open(&db, filename, config)); + + uint64_t seq_num_out = 0; + db->getMaxSeqNum(seq_num_out); + CHK_EQ(100, seq_num_out); + + size_t num = 100; + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + return 0; +} + +void async_callback(bool* invoked, + EventAwaiter* ea, + jungle::Status s, + void* ctx) +{ + if (invoked) *invoked = true; + ea->invoke(); +} + +int async_sync_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.numCompactorThreads = 0; + g_config.numTableWriters = 0; + g_config.compactorSleepDuration_ms = 1000; // 1 second + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + + jungle::DB* db; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + size_t num = 100; + _set_keys(db, 0, num, 1, "k%06zu", "v%06zu"); + + // Async sync. + jungle::FlushOptions f_opt; + f_opt.syncOnly = true; + f_opt.callFsync = true; + + bool callback_invoked = false; + EventAwaiter ea; + CHK_Z( db->flushLogsAsync + ( f_opt, + std::bind( async_callback, + &callback_invoked, + &ea, + std::placeholders::_1, + std::placeholders::_2 ), + nullptr ) ); + + ea.wait_ms(1000 * 5); + CHK_TRUE( callback_invoked ); + + uint64_t seq_num_out = 0; + CHK_Z( db->getLastSyncedSeqNum(seq_num_out) ); + CHK_EQ( num, seq_num_out ); + + // Put more keys. + _set_keys(db, num, 2 * num, 1, "k%06zu", "v%06zu"); + + // They are not synced. 
+ seq_num_out = 0; + CHK_Z( db->getLastSyncedSeqNum(seq_num_out) ); + CHK_EQ( num, seq_num_out ); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int crash_after_adding_new_log_file_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + + jungle::DB* db; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + _set_keys(db, 0, 5, 1, "k%06zu", "v%06zu"); + CHK_Z( db->sync(false) ); + + _set_keys(db, 5, 25, 1, "k%06zu", "v%06zu"); + + // Copy files somewhere. + jungle::FileMgr::mkdir(filename + "/backup"); + jungle::FileMgr::copy(filename + "/log0000_*", filename + "/backup"); + + CHK_Z( jungle::DB::close(db) ); + + // Restore and re-open. + jungle::FileMgr::copy(filename + "/backup/log0000_*", filename + "/"); + CHK_Z( jungle::DB::open(&db, filename, config) ); + + uint64_t seq_num_out = 0; + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( 5, seq_num_out ); + _set_keys(db, 5, 15, 1, "k%06zu", "v%06zu"); + + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( 15, seq_num_out ); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::DB::open(&db, filename, config) ); + + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( 15, seq_num_out ); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int log_rollback_basic_test(bool sync_before_rollback) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + + jungle::DB* db; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + const size_t N1 = 8, N2 = 15, ROLLBACK = 5; + _set_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + + if (sync_before_rollback) { + CHK_Z( db->sync(false) ); + } + + _get_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + + CHK_Z( db->rollback(ROLLBACK) ); + + jungle::KV kv_out; + uint64_t seq_num_out = 0; + + // Check rollback. + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( ROLLBACK, seq_num_out ); + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + for (size_t ii=ROLLBACK+1; ii<=N1; ++ii) CHK_FALSE( db->getSN(ii, kv_out) ); + + CHK_Z( db->sync(false) ); + + // Even after sync, the results should be the same. + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( ROLLBACK, seq_num_out ); + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + for (size_t ii=ROLLBACK+1; ii<=N1; ++ii) CHK_FALSE( db->getSN(ii, kv_out) ); + + // Put more KVs. + _set_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + + // Old KVs (rolled-back) shouldn't be visible. + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + _get_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::DB::open(&db, filename, config) ); + + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( N2, seq_num_out ); + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + _get_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int log_rollback_multi_files_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + + jungle::DB* db; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + const size_t N1 = 55, N2 = 95, ROLLBACK = 25; + _set_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + CHK_Z( db->sync(false) ); + _get_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + + CHK_Z( db->rollback(ROLLBACK) ); + + jungle::KV kv_out; + uint64_t seq_num_out = 0; + + // Check rollback. + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( ROLLBACK, seq_num_out ); + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + for (size_t ii=ROLLBACK+1; ii<=N1; ++ii) CHK_FALSE( db->getSN(ii, kv_out) ); + + CHK_Z( db->sync(false) ); + + // Even after sync, the results should be the same. + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( ROLLBACK, seq_num_out ); + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + for (size_t ii=ROLLBACK+1; ii<=N1; ++ii) CHK_FALSE( db->getSN(ii, kv_out) ); + + // Put more KVs. + _set_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + + // Old KVs (rolled-back) shouldn't be visible. + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + _get_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::DB::open(&db, filename, config) ); + + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( N2, seq_num_out ); + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + _get_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int log_rollback_and_reclaim_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + + jungle::DB* db; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + const size_t N1 = 99, N2 = 105, ROLLBACK = 98; + _set_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + + CHK_Z( db->sync(false) ); + + _get_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + + CHK_Z( db->rollback(ROLLBACK) ); + + jungle::KV kv_out; + uint64_t seq_num_out = 0; + + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( ROLLBACK, seq_num_out ); + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + for (size_t ii=ROLLBACK+1; ii<=N1; ++ii) CHK_FALSE( db->getSN(ii, kv_out) ); + + // Put more KVs. + _set_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + CHK_Z( db->sync(false) ); + + // Old KVs (rolled-back) shouldn't be visible. + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + _get_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + + // Wait for reclaim of old log. + TestSuite::sleep_ms(1500, "wait for reclaim"); + + // Wait one more time. + TestSuite::sleep_ms(1500, "wait for reclaim"); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int log_rollback_with_concurrent_write_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + + jungle::DB* db; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + const size_t N1 = 99, N2 = 105, ROLLBACK = 98; + _set_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + + CHK_Z( db->sync(false) ); + + _get_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + + // Set delay for compaction. 
+ jungle::DebugParams d_params; + d_params.rollbackDelayUs = 1000000; // 1 second delay. + jungle::setDebugParams(d_params); + + std::thread t_rollback + ( [db, ROLLBACK]() { + (void)ROLLBACK; + db->rollback(ROLLBACK); + } ); + + TestSuite::sleep_us(d_params.rollbackDelayUs / 2, "wait for rollback"); + + // Write operations should fail. + { + char key_str[256]; + char val_str[256]; + for (size_t ii=ROLLBACK; iiset(jungle::KV(key, val)) ); + } + } + CHK_FALSE( db->sync(false) ); + CHK_FALSE( db->syncNoWait(false) ); + CHK_FALSE( db->flushLogsAsync(jungle::FlushOptions(), nullptr, nullptr) ); + + if (t_rollback.joinable()) t_rollback.join(); + + // After rollback is done, they should succeed. + // Put more KVs. + _set_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + CHK_Z( db->sync(false) ); + + // Old KVs (rolled-back) shouldn't be visible. + _get_keys(db, 0, ROLLBACK, 1, "k%06zu", "v%06zu"); + _get_keys(db, ROLLBACK, N2, 1, "k%06zu", "v2_%06zu"); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int urgent_reclaim_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 3600; + config.maxKeepingMemtables = 5; + + jungle::DB* db; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + const size_t N1 = 200; + _set_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + CHK_Z( db->sync(false) ); + + _get_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + + TestSuite::sleep_sec(2, "wait for reclaiming.."); + + // Read from the beginning, may cause reload and re-purge. 
+ uint64_t cur_seq = 1; + for (size_t ii=0; iigetSN(cur_seq, kv_out)); + + char key_str[256], val_str[256]; + sprintf(key_str, "k%06zu", ii); + sprintf(val_str, "v%06zu", ii); + jungle::SizedBuf key_exp(key_str); + jungle::SizedBuf val_exp(val_str); + CHK_EQ(key_exp, kv_out.key); + CHK_EQ(val_exp, kv_out.value); + cur_seq++; + } + + TestSuite::sleep_sec(2, "wait for reclaiming.."); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int log_flush_zero_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 10; + config.logSectionOnly = true; + config.logFileTtl_sec = 60; + + jungle::DB* db; + CHK_Z( jungle::DB::open(&db, filename, config) ); + + const size_t N1 = 100; + _set_keys(db, 0, N1, 1, "k%06zu", "v%06zu"); + CHK_Z( db->sync(false) ); + + // Flush with 0, should do nothing. + db->flushLogs(jungle::FlushOptions(), 0); + + uint64_t seq_num_out = 0; + CHK_Z( db->getMaxSeqNum(seq_num_out) ); + CHK_EQ( N1, seq_num_out ); + + // Should fail. 
+ CHK_NEQ(0, db->getLastFlushedSeqNum(seq_num_out)); + + CHK_Z( jungle::DB::close(db) ); + CHK_Z( jungle::shutdown() ); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +} using namespace log_reclaim_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("basic log reclaim test", + basic_log_reclaim_test); + + ts.doTest("multi instances log reclaim test", + multi_instances_reclaim_test); + + ts.doTest("reload log in reclaim mode test", + reload_log_in_reclaim_mode_test); + + ts.doTest("async-sync test", + async_sync_test); + + ts.doTest("crash after adding new log file test", + crash_after_adding_new_log_file_test); + + ts.doTest("log rollback basic test", + log_rollback_basic_test, + TestRange( {true, false} )); + + ts.doTest("log rollback multi files test", + log_rollback_multi_files_test); + + ts.doTest("log rollback and reclaim test", + log_rollback_and_reclaim_test); + + ts.doTest("log rollback with concurrent write test", + log_rollback_with_concurrent_write_test); + + ts.doTest("urgent reclaim test", + urgent_reclaim_test); + + ts.doTest("log flush upto zero test", + log_flush_zero_test); + +#if 0 + ts.doTest("reload empty files test", + reload_with_empty_files_test_load); +#endif + + return 0; +} diff --git a/tests/jungle/mt_test.cc b/tests/jungle/mt_test.cc new file mode 100644 index 0000000..6540fef --- /dev/null +++ b/tests/jungle/mt_test.cc @@ -0,0 +1,188 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "config_test_common.h" +#include "test_common.h" + +#include "libjungle/jungle.h" + +#include +#include +#include +#include + +#include +#include + +namespace mt_test { + +struct mt1_args : public TestSuite::ThreadArgs { + enum Type { + READER = 0, + WRITER = 1 + }; + + mt1_args() {} + mt1_args(jungle::DB* _db, Type _type, int _duration) + : db(_db), type(_type), duration_ms(_duration) + {} + + jungle::DB* db; + Type type; + int duration_ms; + int ret; +}; + +int mt1_reader(TestSuite::ThreadArgs* t_args) { + mt1_args* args = static_cast(t_args); + TestSuite::Timer timer(args->duration_ms); + do { + jungle::Status s; + uint64_t seq; + s = args->db->getMaxSeqNum(seq); + if (!s || !seq) continue; + + uint64_t r = (rand() % seq); + jungle::Record rec; + jungle::Record::Holder h_rec(rec); + std::string key_str("k" + TestSuite::lzStr(6, r)); + s = args->db->getRecordByKey( jungle::SizedBuf(key_str), rec ); + CHK_OK(s); + } while (!timer.timeover()); + return 0; +} + +int mt1_writer(TestSuite::ThreadArgs* t_args) { + mt1_args* args = static_cast(t_args); + TestSuite::Timer timer(args->duration_ms); + uint64_t count = 0; + do { + jungle::Status s; + jungle::KV kv; + std::string key_str("k" + TestSuite::lzStr(6, count)); + std::string val_str("v" + TestSuite::lzStr(6, count)); + kv.set(key_str, val_str); + s = args->db->set(kv); + CHK_OK(s); + if (count % 1000 == 0) { + s = args->db->sync(); + // NOTE: `s` may fail if there is concurrent flush + // by BG flusher. 
+ (void)s; + } + count++; + } while (!timer.timeover()); + return 0; +} + +int mt1_single_writer_multi_reader_seq(size_t duration_sec) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + jungle::DB::open(&db, filename, config); + + int num_reader = 2; + + TestSuite::ThreadHolder* w_holder = nullptr; + mt1_args w_args; + w_args = mt1_args(db, mt1_args::WRITER, 1000 * duration_sec); + w_holder = new TestSuite::ThreadHolder(&w_args, mt1_writer, nullptr); + + mt1_args r_args[num_reader]; + std::vector r_holders(num_reader, nullptr); + for (int i=0; ijoin(); + CHK_Z(w_holder->getResult()); + delete w_holder; + + for (int i=0; ijoin(); + CHK_Z(r_holder->getResult()); + delete r_holder; + } + + jungle::DB::close(db); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int flusher_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + jungle::init(g_config); + + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // 1 second. + TestSuite::Timer timer(1000); + uint64_t ii = 0; + while (!timer.timeover()) { + jungle::KV kv; + std::string key_str("k" + TestSuite::lzStr(6, ii)); + std::string val_str("v" + TestSuite::lzStr(6, ii)); + kv.set(key_str, val_str); + db->set(kv); + ii++; + + // 5000 r/sec * 1 sec = 1000 records. 
+ TestSuite::sleep_us(200); + } + + s = jungle::DB::close(db); + CHK_OK(s); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +} using namespace mt_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("single writer multi reader seq test", + mt1_single_writer_multi_reader_seq, + TestRange({1})); + + ts.doTest("flusher test", + flusher_test); + + return 0; +} diff --git a/tests/jungle/seq_itr_test.cc b/tests/jungle/seq_itr_test.cc new file mode 100644 index 0000000..383dda4 --- /dev/null +++ b/tests/jungle/seq_itr_test.cc @@ -0,0 +1,552 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "jungle_test_common.h" + +#include "libjungle/jungle.h" + +#include + +#include + +int itr_seq_empty() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 10; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Iterator on empty DB, should succeed. + jungle::Iterator itr; + s = itr.initSN(db); + CHK_Z(s); + + // All get, prev, next should fail. 
+ jungle::Record rec_out; + CHK_NOT(itr.get(rec_out)); + CHK_NOT(itr.prev()); + CHK_NOT(itr.next()); + + // Close iterator. + itr.close(); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_seq_basic() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key", "value")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Iterator. + jungle::Iterator itr; + s = itr.initSN(db); + CHK_Z(s); + + // prev() should fail. + s = itr.prev(); + CHK_NOT(s); + + // Check returned records (forward). + CHK_Z(_itr_check(0, n, itr, kv)); + // Check returned records (backward). + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + + // Close iterator. + itr.close(); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_seq_purge() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync & partial flush. + CHK_Z(db->sync(false)); + + uint64_t flush_upto = 2; + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options, flush_upto)); + + // Min seqnum == flush_upto + 1. + { uint64_t seq; + s = db->getMinSeqNum(seq); + CHK_Z(s); + CHK_EQ(flush_upto+1, seq); } + + // Flush seqnum == flush_upto. 
+ { uint64_t seq; + s = db->getLastFlushedSeqNum(seq); + CHK_Z(s); + CHK_EQ(flush_upto, seq); } + + // Iterator. + jungle::Iterator itr; + s = itr.initSN(db); + CHK_Z(s); + + // prev() should fail. + s = itr.prev(); + CHK_NOT(s); + + // Check returned records. + CHK_Z(_itr_check(0, n, itr, kv)); + // Check returned records (backward). + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + + // Close iterator. + itr.close(); + + // Flush all. + CHK_Z(db->flushLogs(f_options)); + + // Check again. + CHK_Z(itr.initSN(db)); + CHK_Z(_itr_check(0, n, itr, kv)); + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + CHK_Z(itr.close()); + + // Set more KV pairs. + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key2_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv2)); + CHK_Z(db->sync(false)); + + // Check again. + CHK_Z(itr.initSN(db)); + CHK_Z(_itr_check(0, n, itr, kv)); + CHK_Z(_itr_check(0, n, itr, kv2)); + CHK_Z(_itr_check_bwd(0, n, itr, kv2)); + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + CHK_Z(itr.close()); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_seq_isolation() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Iterator. + jungle::Iterator itr; + s = itr.initSN(db); + CHK_Z(s); + + // Add more KVs + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key2_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv2)); + + // Check returned records. + // New KVs should not be visiable by this iterator. 
+ CHK_Z(_itr_check(0, n, itr, kv)); + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + + // But visiable by normal get. + CHK_Z(_get_bykey_check(0, n, db, kv2)); + + // Close iterator. + itr.close(); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_seq_multiple_logs() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 10; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 100; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync & partial flush. + CHK_Z(db->sync(false)); + + uint64_t flush_upto = 150; + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options, flush_upto)); + + // By-key check. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Iterator. + jungle::Iterator itr; + s = itr.initSN(db); + CHK_Z(s); + CHK_Z(_itr_check(0, n, itr, kv)); + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + s = itr.close(); + CHK_Z(s); + + // Flush all. + CHK_Z(db->flushLogs(f_options)); + + // Check again. + CHK_Z(itr.initSN(db)); + CHK_Z(_itr_check(0, n, itr, kv)); + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + CHK_Z(itr.close()); + + // By-key check. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_seq_preserve_log_file_test() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. 
+ jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 10; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 100; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync & partial flush. + CHK_Z(db->sync(false)); + + // Open iterator. + jungle::Iterator itr; + s = itr.initSN(db); + CHK_Z(s); + + // Flush while the iterator is still alive. + uint64_t flush_upto = n * 95 / 100; + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options, flush_upto)); + + // Check. + CHK_Z(_itr_check(0, n, itr, kv)); + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + + // By-key check. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Close iterator. + s = itr.close(); + CHK_Z(s); + + // Re-open iterator. + s = itr.initSN(db); + CHK_Z(s); + CHK_Z(_itr_check(0, n, itr, kv)); + CHK_Z(_itr_check_bwd(0, n, itr, kv)); + s = itr.close(); + CHK_Z(s); + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int itr_seq_seek() { + jungle::DB* db; + jungle::Status s; + + std::string filename; + TEST_SUITE_PREPARE_PATH(filename) + + // Open DB. + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.maxEntriesInLogFile = 5; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 50; + std::vector kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync & partial flush. + CHK_Z(db->sync(false)); + + uint64_t flush_upto = 23; + jungle::FlushOptions f_options; + CHK_Z(db->flushLogs(f_options, flush_upto)); + + // By-key check. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Iterator. + jungle::Iterator itr; + CHK_Z(itr.initSN(db)); + + // Forward + for (int ii=0; iiflushLogs(f_options)); + + // Check again. 
+ CHK_Z(itr.initSN(db)); + // Forward + for (int ii=0; ii kv(n); + + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync. + CHK_Z(db->sync(false)); + + // By-key check. + CHK_Z(_get_bykey_check(0, n, db, kv)); + + // Iterator. + for (size_t ii=1; ii<=50; ++ii) { + for (size_t jj=ii; jj<=50; ++jj) { + TestSuite::setInfo("ii: %zu, jj: %zu", ii, jj); + jungle::Iterator itr; + CHK_Z(itr.initSN(db, ii, jj)); + CHK_Z(_itr_check(ii-1, jj, itr, kv)); + } + } + + // Close DB. + s = jungle::DB::close(db); + CHK_Z(s); + + // Free all resources for jungle. + jungle::shutdown(); + _free_kv_pairs(n, kv); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("seq empty itr", itr_seq_empty); + ts.doTest("seq itr test", itr_seq_basic); + ts.doTest("seq itr flush test", itr_seq_purge); + ts.doTest("seq itr snapshot isolation test", itr_seq_isolation); + ts.doTest("seq itr multiple log files test", itr_seq_multiple_logs); + ts.doTest("seq itr preserve log files test", itr_seq_preserve_log_file_test); + ts.doTest("seq itr seek test", itr_seq_seek); + ts.doTest("seq itr min max test", itr_seq_min_max_logs); + + return 0; +} diff --git a/tests/jungle/snapshot_test.cc b/tests/jungle/snapshot_test.cc new file mode 100644 index 0000000..ab7ce9c --- /dev/null +++ b/tests/jungle/snapshot_test.cc @@ -0,0 +1,1019 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "jungle_test_common.h" + +#include "libjungle/jungle.h" + +#include + +#include + +namespace snapshot_test { + +int checkpoint_basic_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + + // 0 -- n-1: key_x value_x + CHK_OK(_init_kv_pairs(n, kv, "key_", "value_") == 0); + CHK_OK(_set_byseq_kv_pairs(0, n, 0, db, kv) == 0); + + // Checkpoint. + uint64_t seq_num_out = 0; + s = db->checkpoint(seq_num_out); + CHK_OK(s); + CHK_EQ(n, (int)seq_num_out); + + // n -- 2n-1: key2_x value2_x + std::vector kv2(n); + CHK_OK(_init_kv_pairs(n, kv2, "key2_", "value2_") == 0); + CHK_OK(_set_byseq_kv_pairs(0, n, n, db, kv2) == 0); + + // Checkpoint again. + s = db->checkpoint(seq_num_out); + CHK_OK(s); + CHK_EQ(2*n, (int)seq_num_out); + + // Flush. + s = db->flushLogs(jungle::FlushOptions()); + CHK_OK(s); + + // 2n -- 3n-1: key3_x value3_x + std::vector kv3(n); + CHK_OK(_init_kv_pairs(n, kv3, "key3_", "value3_") == 0); + CHK_OK(_set_byseq_kv_pairs(0, n, 2*n, db, kv3) == 0); + s = db->sync(false); + + // Get all. 
+ CHK_Z(_get_bykey_check(0, n, db, kv)); + CHK_Z(_get_bykey_check(0, n, db, kv2)); + CHK_Z(_get_bykey_check(0, n, db, kv3)); + + s = jungle::DB::close(db); + CHK_OK(s); + + jungle::shutdown(); + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + _free_kv_pairs(n, kv3); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int checkpoint_marker_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 10; + std::list chk_local; + + for (size_t ii=0; ii<10; ++ii) { + // Append `n` pairs. + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, ii*n, db, kv)); + + // Checkpoint. + uint64_t seq_num_out = 0; + CHK_OK(db->checkpoint(seq_num_out)); + chk_local.push_back(seq_num_out); + + _free_kv_pairs(n, kv); + } + + // Get list of checkpoints. + std::list chk_out; + s = db->getCheckpoints(chk_out); + CHK_OK(s); + + // Should be identical. + CHK_Z(_cmp_lists(chk_local, chk_out)); + + s = jungle::DB::close(db); + CHK_OK(s); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int checkpoint_marker_flush_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 10; + std::list chk_local; + + for (size_t ii=0; ii<10; ++ii) { + // Append `n` pairs. + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, ii*n, db, kv)); + + // Checkpoint. + uint64_t seq_num_out = 0; + CHK_OK(db->checkpoint(seq_num_out)); + chk_local.push_back(seq_num_out); + + _free_kv_pairs(n, kv); + } + + // Flush half. 
+ CHK_OK(db->flushLogs(jungle::FlushOptions(), n*10/2)); + + // Get list of checkpoints. + std::list chk_out; + s = db->getCheckpoints(chk_out); + CHK_OK(s); + + // Should be identical. + CHK_Z(_cmp_lists(chk_local, chk_out)); + + s = jungle::DB::close(db); + CHK_OK(s); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int checkpoint_marker_purge_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 10; + std::list chk_local; + + for (size_t ii=0; ii<10; ++ii) { + // Append `n` pairs. + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, ii*n, db, kv)); + + // Checkpoint. + uint64_t seq_num_out = 0; + CHK_OK(db->checkpoint(seq_num_out)); + if (seq_num_out > 50) chk_local.push_back(seq_num_out); + + _free_kv_pairs(n, kv); + } + + // Purge half. + jungle::FlushOptions f_opt; + f_opt.purgeOnly = true; + CHK_OK(db->flushLogs(f_opt, n*10/2)); + + // Get list of checkpoints. + std::list chk_out; + s = db->getCheckpoints(chk_out); + CHK_OK(s); + + // Should be identical. + CHK_Z(_cmp_lists(chk_local, chk_out)); + + s = jungle::DB::close(db); + CHK_OK(s); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int checkpoint_on_cold_start() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Sync and close. + CHK_Z(db->sync(false)); + CHK_Z(jungle::DB::close(db)); + + // Re-open. 
+ CHK_Z(jungle::DB::open(&db, filename, config)); + + // Add checkpoint. + uint64_t seq_num_out; + CHK_Z(db->checkpoint(seq_num_out, false)); + + // New records. + CHK_Z(_set_byseq_kv_pairs(0, n, 5, db, kv)); + + // Sync and close. + CHK_Z(db->sync(false)); + CHK_Z(jungle::DB::close(db)); + + // Re-open. + CHK_Z(jungle::DB::open(&db, filename, config)); + + // Get checkpoint. + std::list chk_out; + CHK_Z(db->getCheckpoints(chk_out)); + + CHK_EQ(1, chk_out.size()); + CHK_EQ(seq_num_out, *chk_out.rbegin()); + + CHK_Z(jungle::DB::close(db)); + jungle::shutdown(); + + _free_kv_pairs(n, kv); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int snapshot_basic_table_only_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Checkpoint. + uint64_t chk = 0; + CHK_OK(db->checkpoint(chk)); + + // Set more. + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv2)); + + // Sync and flush. + CHK_OK(db->sync(false)); + CHK_OK(db->flushLogs(jungle::FlushOptions())); + + // Open snapshot. + jungle::DB* snap; + CHK_OK(db->openSnapshot(&snap, chk)); + + // Old value should be visible by snapshot. + CHK_Z(_get_bykey_check(0, n, snap, kv)); + + // New value should be visible by DB handle. + CHK_Z(_get_bykey_check(0, n, db, kv2)); + + // Close snapshot. 
+ CHK_OK(jungle::DB::close(snap)); + + s = jungle::DB::close(db); + CHK_OK(s); + + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int snapshot_basic_log_only_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Checkpoint. + uint64_t chk = 0; + CHK_OK(db->checkpoint(chk)); + + // Set more. + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv2)); + + // Sync. + CHK_OK(db->sync(false)); + + // Open snapshot. + jungle::DB* snap; + CHK_OK(db->openSnapshot(&snap, chk)); + + // Old value should be visible by snapshot. + CHK_Z(_get_bykey_check(0, n, snap, kv)); + + // New value should be visible by DB handle. + CHK_Z(_get_bykey_check(0, n, db, kv2)); + + // Close snapshot. + CHK_OK(jungle::DB::close(snap)); + + s = jungle::DB::close(db); + CHK_OK(s); + + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int snapshot_basic_combined_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Checkpoint. + uint64_t chk = 0; + CHK_OK(db->checkpoint(chk)); + + // Set more. 
+ std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv2)); + + // Sync and flush. + CHK_OK(db->sync(false)); + jungle::FlushOptions f_opt; + CHK_OK(db->flushLogs(f_opt)); + + // Set more and checkpoint. + std::vector kv3(n); + CHK_Z(_init_kv_pairs(n, kv3, "key3_", "value3_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n*2, db, kv3)); + + CHK_OK(db->checkpoint(chk)); + + // Set more and sync. + std::vector kv4(n); + CHK_Z(_init_kv_pairs(n, kv4, "key3_", "value4_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n*3, db, kv4)); + CHK_OK(db->sync(false)); + + // Open snapshot. + jungle::DB* snap; + CHK_OK(db->openSnapshot(&snap, chk)); + + // Flush after that. + CHK_OK(db->flushLogs(f_opt)); + + // Snapshot: sees kv2 and kv3. + CHK_Z(_get_bykey_check(0, n, snap, kv2)); + CHK_Z(_get_bykey_check(0, n, snap, kv3)); + + // DB: sees kv2 and kv4. + CHK_Z(_get_bykey_check(0, n, db, kv2)); + CHK_Z(_get_bykey_check(0, n, db, kv4)); + + // Close snapshot. + CHK_OK(jungle::DB::close(snap)); + + // Close DB. + CHK_OK(jungle::DB::close(db)); + + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + _free_kv_pairs(n, kv3); + _free_kv_pairs(n, kv4); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int snapshot_iterator_table_only_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Checkpoint. + uint64_t chk = 0; + CHK_OK(db->checkpoint(chk)); + + // Set more. + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv2)); + + // Sync and flush. 
+ CHK_OK(db->sync(false)); + CHK_OK(db->flushLogs(jungle::FlushOptions())); + + // Open snapshot. + jungle::DB* snap; + CHK_OK(db->openSnapshot(&snap, chk)); + + jungle::Iterator itr; + CHK_OK(itr.initSN(snap)); + + // kv should be found. + CHK_Z(_itr_check(0, n, itr, kv)); + + // No more records. + CHK_NOT(itr.next()); + + CHK_OK(itr.close()); + + // Close snapshot. + CHK_OK(jungle::DB::close(snap)); + + s = jungle::DB::close(db); + CHK_OK(s); + + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int snapshot_iterator_log_only_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Checkpoint. + uint64_t chk = 0; + CHK_OK(db->checkpoint(chk)); + + // Set more. + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv2)); + + // Sync. + CHK_OK(db->sync(false)); + + // Open snapshot. + jungle::DB* snap; + CHK_OK(db->openSnapshot(&snap, chk)); + + jungle::Iterator itr; + CHK_OK(itr.initSN(snap)); + + // kv should be found. + CHK_Z(_itr_check(0, n, itr, kv)); + + // No more records. + CHK_NOT(itr.next()); + + CHK_OK(itr.close()); + + // Close snapshot. 
+ CHK_OK(jungle::DB::close(snap)); + + s = jungle::DB::close(db); + CHK_OK(s); + + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int snapshot_iterator_combined_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + config.maxEntriesInLogFile = 16; + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + int n = 5; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key_", "value_")); + CHK_Z(_set_byseq_kv_pairs(0, n, 0, db, kv)); + + // Checkpoint. + uint64_t chk = 0; + CHK_OK(db->checkpoint(chk)); + + // Set more. + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key_", "value2_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n, db, kv2)); + + // Sync and flush. + CHK_OK(db->sync(false)); + jungle::FlushOptions f_opt; + CHK_OK(db->flushLogs(f_opt)); + + // Set more and checkpoint. + std::vector kv3(n); + CHK_Z(_init_kv_pairs(n, kv3, "key3_", "value3_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n*2, db, kv3)); + + CHK_OK(db->checkpoint(chk)); + + // Set more and sync. + std::vector kv4(n); + CHK_Z(_init_kv_pairs(n, kv4, "key3_", "value4_")); + CHK_Z(_set_byseq_kv_pairs(0, n, n*3, db, kv4)); + CHK_OK(db->sync(false)); + + // Open snapshot. + jungle::DB* snap; + CHK_OK(db->openSnapshot(&snap, chk)); + + // Iterator. + jungle::Iterator itr; + CHK_OK(itr.initSN(snap)); + + // Flush after that. + CHK_OK(db->flushLogs(f_opt)); + + // Snapshot: sees kv2 and kv3. + CHK_Z(_itr_check(0, n, itr, kv2)); + CHK_Z(_itr_check(0, n, itr, kv3)); + + // No more records. + CHK_NOT(itr.next()); + + CHK_OK(itr.close()); + + // Close snapshot. + CHK_OK(jungle::DB::close(snap)); + + // Close DB. 
+ CHK_OK(jungle::DB::close(db)); + + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + _free_kv_pairs(n, kv3); + _free_kv_pairs(n, kv4); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int snapshot_with_compaction_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db; + + s = jungle::DB::open(&db, filename, config); + CHK_OK(s); + + // Set KV pairs. + for (size_t ii=0; ii<30; ii+=10) { + CHK_Z( _set_keys(db, ii, ii+10, 1, "key_%04d", "val_%04d") ); + + uint64_t seq_num_out; + CHK_Z( db->checkpoint(seq_num_out, false) ); + TestSuite::_msg("%zu\n", seq_num_out); + } + + // Flush. + CHK_Z( db->flushLogs(jungle::FlushOptions()) ); + + // Verify checkpoints. + for (int chk: {10, 20, 30}) { + jungle::DB* snap_out = nullptr; + CHK_Z( db->openSnapshot(&snap_out, chk) ); + + jungle::Iterator itr; + CHK_Z( itr.initSN(snap_out) ); + + size_t count = 0; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + rec_out.free(); + count++; + } while (itr.next().ok()); + CHK_EQ(chk, count); + CHK_Z( itr.close() ); + CHK_Z( jungle::DB::close(snap_out) ); + } + + // Compact. + for (size_t ii=0; iicompactL0(jungle::CompactOptions(), ii) ); + } + + // Verify checkpoints: + for (int chk: {10, 20, 30}) { + jungle::DB* snap_out = nullptr; + CHK_NOT( db->openSnapshot(&snap_out, chk).ok() ); + } + + // Put more. + for (size_t ii=30; ii<50; ii+=10) { + CHK_Z( _set_keys(db, ii, ii+10, 1, "key_%04d", "val_%04d") ); + + uint64_t seq_num_out; + CHK_Z( db->checkpoint(seq_num_out, false) ); + TestSuite::_msg("%zu\n", seq_num_out); + } + + // Flush. + CHK_Z( db->flushLogs(jungle::FlushOptions()) ); + + // Verify checkpoints: + // 10-30: invalid. + // 40-50: valid. 
+ for (int chk: {10, 20, 30, 40, 50}) { + jungle::DB* snap_out = nullptr; + if (chk < 40) { + CHK_NOT( db->openSnapshot(&snap_out, chk).ok() ); + continue; + } + CHK_Z( db->openSnapshot(&snap_out, chk) ); + + jungle::Iterator itr; + CHK_Z( itr.initSN(snap_out) ); + + size_t count = 0; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + rec_out.free(); + count++; + } while (itr.next().ok()); + CHK_EQ(chk, count); + CHK_Z( itr.close() ); + CHK_Z( jungle::DB::close(snap_out) ); + } + + // Compact. + for (size_t ii=0; iicompactL0(jungle::CompactOptions(), ii) ); + } + + for (int chk: {10, 20, 30, 40, 50}) { + TestSuite::setInfo("chk %d", chk); + jungle::DB* snap_out = nullptr; + CHK_NOT( db->openSnapshot(&snap_out, chk).ok() ); + } + + // Put more. + for (size_t ii=50; ii<70; ii+=10) { + CHK_Z( _set_keys(db, ii, ii+10, 1, "key_%04d", "val_%04d") ); + + uint64_t seq_num_out; + CHK_Z( db->checkpoint(seq_num_out, false) ); + TestSuite::_msg("%zu\n", seq_num_out); + } + + // Verify checkpoints: + // 10-50: invalid. + // 60-70: valid. + for (int chk: {10, 20, 30, 40, 50, 60, 70}) { + jungle::DB* snap_out = nullptr; + if (chk < 60) { + CHK_NOT( db->openSnapshot(&snap_out, chk).ok() ); + continue; + } + CHK_Z( db->openSnapshot(&snap_out, chk) ); + + jungle::Iterator itr; + CHK_Z( itr.initSN(snap_out) ); + + size_t count = 0; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + + rec_out.free(); + count++; + } while (itr.next().ok()); + CHK_EQ(chk, count); + CHK_Z( itr.close() ); + CHK_Z( jungle::DB::close(snap_out) ); + } + + // Flush. + CHK_Z( db->flushLogs(jungle::FlushOptions()) ); + + // Close DB. 
+ CHK_OK(jungle::DB::close(db)); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int latest_snapshot_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db = nullptr; + + config.maxEntriesInLogFile = 10; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Set KV pairs. + int n = 25; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv)); + + // Sync and flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key2_", "value2_")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv2)); + CHK_Z(db->sync(false)); + + // Open the latest snapshot (checkpoint = 0). + jungle::DB* snap = nullptr; + CHK_Z(db->openSnapshot(&snap)); + + // Set more KV pairs, using the same key. + std::vector kv3(n); + CHK_Z(_init_kv_pairs(n, kv3, "key1_", "value1_new_")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv3)); + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + std::vector kv4(n); + CHK_Z(_init_kv_pairs(n, kv4, "key2_", "value2_new_")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv4)); + CHK_Z(db->sync(false)); + + // Do compaction. + for (size_t ii=0; iicompactL0(jungle::CompactOptions(), ii) ); + } + + // Now open iterator using the snapshot. + jungle::Iterator itr; + CHK_Z(itr.init(snap)); + + // Should see kv and kv2 only. + CHK_Z(_itr_check(0, n, itr, kv)); + CHK_Z(_itr_check(0, n, itr, kv2)); + + // Create another iterator from the snapshot in parallel + // and do the same thing. + jungle::Iterator itr2; + CHK_Z(itr2.init(snap)); + CHK_Z(_itr_check(0, n, itr2, kv)); + CHK_Z(_itr_check(0, n, itr2, kv2)); + + // Create another iterator from the main DB in parallel. + // Should see new kv. 
+ jungle::Iterator itr3; + CHK_Z(itr3.init(db)); + CHK_Z(_itr_check(0, n, itr3, kv3)); + CHK_Z(_itr_check(0, n, itr3, kv4)); + + // Close iterators. + CHK_Z(itr.close()); + CHK_Z(itr2.close()); + CHK_Z(itr3.close()); + + // Close snapshot. + CHK_Z(jungle::DB::close(snap)); + + // Close DB. + CHK_Z(jungle::DB::close(db)); + + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + _free_kv_pairs(n, kv3); + _free_kv_pairs(n, kv4); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +int empty_db_snapshot_test() { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + jungle::DB* db = nullptr; + + config.maxEntriesInLogFile = 10; + s = jungle::DB::open(&db, filename, config); + CHK_Z(s); + + // Open snapshot on empty DB, should succeed. + jungle::DB* snap = nullptr; + CHK_Z(db->openSnapshot(&snap)); + + // Set KV pairs. + int n = 25; + std::vector kv(n); + CHK_Z(_init_kv_pairs(n, kv, "key1_", "value1_")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv)); + + // Sync and flush. + CHK_Z(db->sync(false)); + CHK_Z(db->flushLogs(jungle::FlushOptions())); + + // Set more. + std::vector kv2(n); + CHK_Z(_init_kv_pairs(n, kv2, "key2_", "value2_")); + CHK_Z(_set_bykey_kv_pairs(0, n, db, kv2)); + CHK_Z(db->sync(false)); + + // Snapshot shouldn't see anything. + CHK_Z(_get_bykey_check(0, n, snap, kv, false)); + CHK_Z(_get_bykey_check(0, n, snap, kv2, false)); + + // Now open iterator using the snapshot. + jungle::Iterator itr; + CHK_Z(itr.init(snap)); + + // Shouldn't see anything. + jungle::Record rec_out; + CHK_NOT(itr.get(rec_out)); + CHK_NOT(itr.next()); + CHK_NOT(itr.prev()); + + // Close iterator. + CHK_Z(itr.close()); + + // Close snapshot. + CHK_Z(jungle::DB::close(snap)); + + // Close DB. 
+ CHK_Z(jungle::DB::close(db)); + + _free_kv_pairs(n, kv); + _free_kv_pairs(n, kv2); + + jungle::shutdown(); + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +} +using namespace snapshot_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + //ts.options.printTestMessage = true; + ts.doTest("checkpoint basic test", checkpoint_basic_test); + ts.doTest("checkpoint marker test", checkpoint_marker_test); + ts.doTest("checkpoint marker flush test", checkpoint_marker_flush_test); + ts.doTest("checkpoint marker purge test", checkpoint_marker_purge_test); + ts.doTest("checkpoint on cold start test", checkpoint_on_cold_start); + ts.doTest("snapshot basic table only test", snapshot_basic_table_only_test); + ts.doTest("snapshot basic log only test", snapshot_basic_log_only_test); + ts.doTest("snapshot basic combined test", snapshot_basic_combined_test); + ts.doTest("snapshot iterator table only test", snapshot_iterator_table_only_test); + ts.doTest("snapshot iterator log only test", snapshot_iterator_log_only_test); + ts.doTest("snapshot iterator combined test", snapshot_iterator_combined_test); + ts.doTest("snapshot with compaction test", snapshot_with_compaction_test); + ts.doTest("latest snapshot test", latest_snapshot_test); + ts.doTest("empty db snapshot test", empty_db_snapshot_test); + + return 0; +} diff --git a/tests/robust/basic_robust_child.cc b/tests/robust/basic_robust_child.cc new file mode 100644 index 0000000..9e8c94a --- /dev/null +++ b/tests/robust/basic_robust_child.cc @@ -0,0 +1,117 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "config_test_common.h" +#include "libjungle/jungle.h" +#include "test_common.h" + +#include +#include + +#include + +namespace basic_robust_child { + +int robust_child(size_t dur_sec) { + std::string filename = "./robust_child"; + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.fdbCacheSize = (uint64_t)1024*1024*1024; // 1GB + + g_config.numCompactorThreads = 1; + g_config.compactorSleepDuration_ms = 1000; // 1 second + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + // NOTE: Make compaction happen very frequently. 
+ config.compactionFactor = 120; + config.minFileSizeToCompact = 4*1024*1024; + config.minBlockReuseCycleToCompact = 1; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t max_num = 100000; + size_t val_size = 900; + char val_buf[1024]; + memset(val_buf, 'x', 1024); + + { // Initial verify + TestSuite::UnknownProgress upp("verifying"); + upp.update(0); + jungle::Iterator itr; + itr.init(db); + uint64_t ii = 0; + do { + jungle::Record rec; + jungle::Record::Holder h(rec); + s = itr.get(rec); + if (!s) break; + upp.update(ii++); + } while (itr.next().ok()); + upp.done(); + itr.close(); + } + + TestSuite::Progress pp(dur_sec, "testing"); + TestSuite::Timer tt(dur_sec * 1000); + + TestSuite::WorkloadGenerator wg(30000.0); + while (!tt.timeover()) { + if (wg.getNumOpsToDo() == 0) { + TestSuite::sleep_us(100); + continue; + } + + //size_t number = (idx * 7) % max_num; + size_t number = rand() % max_num; + + std::string key_str = "k" + TestSuite::lzStr(7, number); + jungle::SizedBuf key(key_str); + + sprintf(val_buf, "%s", TestSuite::getTimeString().c_str()); + jungle::SizedBuf val(val_size, val_buf); + + CHK_Z( db->set(jungle::KV(key, val)) ); + + pp.update(tt.getTimeSec()); + wg.addNumOpsDone(1); + } + pp.done(); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + return 0; +} + +}; // namespace basic_robust_child +using namespace basic_robust_child; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.options.printTestMessage = true; + ts.doTest( "basic robust child", robust_child, TestRange({90}) ); + + return 0; +} + + diff --git a/tests/robust/basic_robust_main.cc b/tests/robust/basic_robust_main.cc new file mode 100644 index 0000000..ab51e8c --- /dev/null +++ b/tests/robust/basic_robust_main.cc @@ -0,0 +1,16 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + diff --git a/tests/stress/compactor_stress_test.cc b/tests/stress/compactor_stress_test.cc new file mode 100644 index 0000000..ccd9441 --- /dev/null +++ b/tests/stress/compactor_stress_test.cc @@ -0,0 +1,395 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "config_test_common.h" +#include "libjungle/jungle.h" +#include "test_common.h" + +#include "latency_collector.h" +#include "latency_dump.h" +#include "internal_helper.h" + +#include +#include +#include + +#include + +namespace compactor_stress_test { + +static LatencyCollector global_lat; + +static std::string given_path; +static bool verify_at_the_end = true; +static int write_rate = 10000; + +std::string get_key_str(uint64_t idx) { + return "k" + TestSuite::lzStr(8, idx); +} + +struct IteratorArgs : TestSuite::ThreadArgs { + IteratorArgs() + : TestSuite::ThreadArgs() + , db(nullptr) + , maxNum(0) + , batchSize(30) + , lastInsertedNumber(nullptr) + , termSignal(false) {} + jungle::DB* db; + size_t maxNum; + size_t batchSize; + std::atomic* lastInsertedNumber; + std::atomic termSignal; +}; + +int iterator_worker(TestSuite::ThreadArgs* base_args) { + IteratorArgs* args = static_cast(base_args); + uint64_t succ_count = 0; + uint64_t succ_get_count = 0; + uint64_t fail_count = 0; + jungle::Status s; + jungle::Timer tt; + + while ( !args->termSignal ) { + jungle::Iterator itr; + size_t num_limit = args->lastInsertedNumber->load(); + if (!num_limit) { + TestSuite::sleep_ms(1); + continue; + } + + size_t rnd_start = std::rand() % num_limit; + std::string key_s = get_key_str(rnd_start); + std::string key_e = get_key_str(rnd_start + args->batchSize); + + { collectBlockLatency(&global_lat, "init"); + s = itr.init(args->db, jungle::SizedBuf(key_s), jungle::SizedBuf(key_e)); + } + + if (!s) { + fail_count++; + printf("%d %zu %lu\n", (int)s, rnd_start, num_limit); + continue; + } + + size_t batch_cnt = 0; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + succ_get_count++; + + //printf("%.*s\n", (int)rec_out.kv.key.size, rec_out.kv.key.data); + rec_out.free(); + if (batch_cnt++ >= args->batchSize) break; + + { collectBlockLatency(&global_lat, "next"); + s = 
itr.next(); + } + } while (s); + + itr.close(); + succ_count++; + } + TestSuite::_msg("%zu %zu successful reads, %ld failed reads\n", + succ_count, succ_get_count, fail_count); + + return 0; +} + +int cpt_stress_test(size_t dur_sec) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + bool do_initial_load = true; + if (!given_path.empty()) { + filename = given_path; + if ( TestSuite::exist(given_path) && + TestSuite::exist(given_path + "/db_manifest") ) { + do_initial_load = false; + } + } + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.fdbCacheSize = (uint64_t)1*1024*1024*1024; // 1GB + + g_config.numCompactorThreads = 1; + g_config.compactorSleepDuration_ms = 1000; // 1 second + g_config.flusherMinRecordsToTrigger = 8192; + + g_config.numTableWriters = 4; + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.compactionFactor = 200; + //config.compactionFactor = 200; + config.blockReuseFactor = 0; + //config.blockReuseFactor = 0; + config.minFileSizeToCompact = 4*1024*1024; + config.minBlockReuseCycleToCompact = 0; + config.maxBlockReuseCycle = 0; + config.numL0Partitions = 4; + + config.nextLevelExtension = false; + config.maxL0TableSize = (uint64_t)64*1024*1024; + config.maxL1TableSize = (uint64_t)64*1024*1024; + config.maxL1Size = (uint64_t)5*64*1024*1024; + config.tableSizeRatio = {2.5}; + config.levelSizeRatio = {10.0}; + config.bloomFilterBitsPerUnit = 0.0; + config.useBloomFilterForGet = true; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + //const size_t MAX_NUM = 250000; + const size_t MAX_NUM = 400 * 1000; + const size_t VAL_SIZE = 900; + + jungle::DebugParams d_params; + d_params.urgentCompactionRatio = 120; (void)d_params; + //jungle::setDebugParams(d_params, 60); + + char val_buf[1024]; + memset(val_buf, 'x', 1024); + + if (do_initial_load) { + // Initial load + TestSuite::Progress pp(MAX_NUM, "initial load"); + 
TestSuite::Timer tt; + for (size_t ii=0; iiset(jungle::KV(key, val)) ); + pp.update(ii); + } + pp.done(); + db->sync(false); + db->flushLogs(jungle::FlushOptions()); + + TestSuite::_msg( "initial load rate: %s ops/sec\n", + TestSuite::throughputStr(MAX_NUM, tt.getTimeUs()).c_str() ); + TestSuite::sleep_sec(5, "sleep"); + + } else { + TestSuite::_msg("skip initial load\n"); + } + + + size_t idx = 1; + std::atomic last_inserted_number(MAX_NUM); + + size_t NUM_ITR_THREADS = 2; + std::vector args_arr(NUM_ITR_THREADS); + for (IteratorArgs& args: args_arr) { + args.db = db; + args.maxNum = MAX_NUM; + args.lastInsertedNumber = &last_inserted_number; + } + + std::vector hs(NUM_ITR_THREADS); + for (size_t ii=0; iiset(jungle::KV(key, val)) ); + //last_inserted_number = number; + idx++; + + wg.addNumOpsDone(1); + } + pp.done(); + + for (IteratorArgs& args: args_arr) { + args.termSignal = true; + } + + for (size_t ii=0; iijoin(); + CHK_Z(hs[ii]->getResult()); + delete hs[ii]; + } + + if (verify_at_the_end) { + // Final verify + { TestSuite::Timer tt; + TestSuite::Progress pp(MAX_NUM, "verifying (point)"); + for (size_t ii=0; iiget(jungle::SizedBuf(key), value_out) ); + value_out.free(); + pp.update(ii); + } + pp.done(); + TestSuite::_msg( "point lookup rate: %s ops/sec\n", + TestSuite::throughputStr + (MAX_NUM, tt.getTimeUs()).c_str() ); + } + + { TestSuite::Timer tt; + TestSuite::Progress pp(MAX_NUM, "verifying (iterator)"); + jungle::Iterator itr; + itr.init(db); + size_t ii = 0; + do { + TestSuite::setInfo("ii=%zu", ii); + jungle::Record rec_out; + jungle::Record::Holder h_rec_out(rec_out); + CHK_Z( itr.get(rec_out) ); + + std::string key = get_key_str(ii); + CHK_EQ( jungle::SizedBuf(key), rec_out.kv.key ); + + sprintf(val_buf, "v%07zu", ii); + jungle::SizedBuf val(VAL_SIZE, val_buf); + CHK_EQ( val, rec_out.kv.value ); + + pp.update(ii); + ii++; + } while (itr.next()); + itr.close(); + pp.done(); + TestSuite::_msg( "range lookup rate: %s ops/sec\n", + 
TestSuite::throughputStr + (MAX_NUM, tt.getTimeUs()).c_str() ); + } + + // Close & re-open & re-verify + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + { TestSuite::Timer tt; + TestSuite::Progress pp(MAX_NUM, "re-verifying (point)"); + for (size_t ii=0; iiget(jungle::SizedBuf(key), value_out) ); + value_out.free(); + pp.update(ii); + } + pp.done(); + TestSuite::_msg( "point lookup rate: %s ops/sec\n", + TestSuite::throughputStr + (MAX_NUM, tt.getTimeUs()).c_str() ); + } + + { TestSuite::Timer tt; + TestSuite::Progress pp(MAX_NUM, "re-verifying (iterator)"); + jungle::Iterator itr; + itr.init(db); + size_t ii = 0; + do { + TestSuite::setInfo("ii=%zu", ii); + jungle::Record rec_out; + jungle::Record::Holder h_rec_out(rec_out); + CHK_Z( itr.get(rec_out) ); + + std::string key = get_key_str(ii); + CHK_EQ( jungle::SizedBuf(key), rec_out.kv.key ); + + sprintf(val_buf, "v%07zu", ii); + jungle::SizedBuf val(VAL_SIZE, val_buf); + CHK_EQ( val, rec_out.kv.value ); + + pp.update(ii); + ii++; + } while (itr.next()); + itr.close(); + pp.done(); + TestSuite::_msg( "range lookup rate: %s ops/sec\n", + TestSuite::throughputStr + (MAX_NUM, tt.getTimeUs()).c_str() ); + } + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + if (given_path.empty()) { + TEST_SUITE_CLEANUP_PATH(); + } + + TestSuite::Msg msg_stream; + LatencyDumpDefaultImpl dump_impl; + msg_stream << std::endl << global_lat.dump(&dump_impl) << std::endl; + + return 0; +} + +void check_args(int argc, char** argv) { + for (int ii=0; ii({10}) ); + + return 0; +} + diff --git a/tests/stress/flush_stress_test.cc b/tests/stress/flush_stress_test.cc new file mode 100644 index 0000000..39676a1 --- /dev/null +++ b/tests/stress/flush_stress_test.cc @@ -0,0 +1,229 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "config_test_common.h" +#include "libjungle/jungle.h" +#include "test_common.h" + +#include +#include + +#include + +struct FlushReaderArgs : TestSuite::ThreadArgs { + FlushReaderArgs() + : TestSuite::ThreadArgs() + , lastInsertedIdx(0) + , termSignal(false) {} + jungle::DB* db; + std::atomic lastInsertedIdx; + std::atomic termSignal; +}; + +int flush_stress_test_reader(TestSuite::ThreadArgs* base_args) { + FlushReaderArgs* args = static_cast(base_args); + uint64_t succ_count = 0; + uint64_t fail_count = 0; + + while ( !args->termSignal ) { + if (args->lastInsertedIdx < 1000) { + std::this_thread::yield(); + continue; + } + + uint64_t inserted = args->lastInsertedIdx; + for (uint64_t ii=0; iitermSignal; ++ii) { + jungle::Status s; + std::string key = "k" + TestSuite::lzStr(7, ii); + std::string val = "v" + TestSuite::lzStr(7, ii); + + jungle::SizedBuf key_req(key); + jungle::SizedBuf value_out; + s = args->db->get(key_req, value_out); + CHK_Z(s); + + CHK_EQ(jungle::SizedBuf(val), value_out); + value_out.free(); + succ_count++; + } + } + TestSuite::_msg("%ld successful reads, %ld outdated reads\n", succ_count, fail_count); + + return 0; +} + +int flusher_stress_basic_test(size_t dur_sec) { + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = 
TestSuite::getTestFileName(prefix); + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.fdbCacheSize = (uint64_t)1024*1024*1024; // 1GB + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + //config.maxEntriesInLogFile = 1000; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + uint64_t idx = 0; + FlushReaderArgs args; + args.db = db; + + TestSuite::ThreadHolder h(&args, flush_stress_test_reader, nullptr); + + TestSuite::Progress pp(dur_sec, "populating", "sec"); + TestSuite::Timer tt(dur_sec * 1000); + while (!tt.timeover()) { + std::string key = "k" + TestSuite::lzStr(7, idx); + std::string val = "v" + TestSuite::lzStr(7, idx); + CHK_Z(db->set(jungle::KV(key, val))); + idx++; + args.lastInsertedIdx = idx; + + if (idx && idx % 5000 == 0) { + CHK_Z(db->flushLogsAsync(jungle::FlushOptions(), nullptr, nullptr)); + } + + uint64_t cur_sec = tt.getTimeUs() / 1000000; + pp.update(cur_sec); + } + pp.done(); + TestSuite::_msg("%ld writes\n", idx); + + args.termSignal = true; + h.join(); + CHK_Z(h.getResult()); + + // Close, reopen, verify (twice). 
+ for (size_t ii=0; ii<2; ++ii) { + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + pp = TestSuite::Progress(args.lastInsertedIdx, "verifying"); + for (uint64_t ii=0; iiget(key_req, value_out); + CHK_Z(s); + + CHK_EQ(jungle::SizedBuf(val), value_out); + value_out.free(); + pp.update(ii); + } + pp.done(); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TestSuite::clearTestFile(prefix, TestSuite::END_OF_TEST); + return 0; +} + +int auto_flusher_stress_test(size_t dur_sec) { + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.fdbCacheSize = (uint64_t)1024*1024*1024; // 1GB + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + //config.maxEntriesInLogFile = 1000; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + uint64_t idx = 0; + FlushReaderArgs args; + args.db = db; + + TestSuite::ThreadHolder h(&args, flush_stress_test_reader, nullptr); + + TestSuite::Progress pp(dur_sec, "populating", "sec"); + TestSuite::Timer tt(dur_sec * 1000); + while (!tt.timeover()) { + std::string key = "k" + TestSuite::lzStr(7, idx); + std::string val = "v" + TestSuite::lzStr(7, idx); + CHK_Z(db->set(jungle::KV(key, val))); + idx++; + args.lastInsertedIdx = idx; + + uint64_t cur_sec = tt.getTimeUs() / 1000000; + pp.update(cur_sec); + } + TestSuite::_msg("%ld writes\n", idx); + + args.termSignal = true; + h.join(); + CHK_Z(h.getResult()); + + // Close, reopen, verify (twice). 
+ for (size_t ii=0; ii<2; ++ii) { + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + pp = TestSuite::Progress(args.lastInsertedIdx, "verifying"); + for (uint64_t ii=0; iiget(key_req, value_out); + CHK_Z(s); + + CHK_EQ(jungle::SizedBuf(val), value_out); + value_out.free(); + pp.update(ii); + } + pp.done(); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TestSuite::clearTestFile(prefix, TestSuite::END_OF_TEST); + return 0; +} + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.options.printTestMessage = true; + ts.options.abortOnFailure = true; + ts.doTest( "manual flusher stress test", + flusher_stress_basic_test, TestRange({10}) ); + ts.doTest( "auto flusher stress test", + auto_flusher_stress_test, TestRange({10}) ); + + return 0; +} + diff --git a/tests/stress/iterator_stress_test.cc b/tests/stress/iterator_stress_test.cc new file mode 100644 index 0000000..5fab31b --- /dev/null +++ b/tests/stress/iterator_stress_test.cc @@ -0,0 +1,161 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "config_test_common.h" +#include "libjungle/jungle.h" +#include "test_common.h" + +#include +#include + +#include + +namespace ns_iterator_stress_test { + +struct IteratorArgs : TestSuite::ThreadArgs { + IteratorArgs() + : TestSuite::ThreadArgs() + , db(nullptr) + , maxNum(0) + , batchSize(30) + , lastFlushedSeq(0) + , lastInsertedNumber(0) + , termSignal(false) {} + jungle::DB* db; + size_t maxNum; + size_t batchSize; + std::atomic lastFlushedSeq; + std::atomic lastInsertedNumber; + std::atomic termSignal; +}; + +int iterator_worker(TestSuite::ThreadArgs* base_args) { + IteratorArgs* args = static_cast(base_args); + uint64_t succ_count = 0; + uint64_t succ_get_count = 0; + uint64_t fail_count = 0; + jungle::Status s; + + while ( !args->termSignal ) { + jungle::Iterator itr; + size_t num_limit = args->lastInsertedNumber; + if (!num_limit) { + std::this_thread::yield(); + continue; + } + size_t rnd_start = std::rand() % num_limit; + std::string key_s = "k" + TestSuite::lzStr(7, rnd_start); + std::string key_e = "k" + TestSuite::lzStr(7, rnd_start + 100); + + s = itr.init(args->db, jungle::SizedBuf(key_s), jungle::SizedBuf(key_e)); + if (!s) { + fail_count++; + printf("%d %zu %lu\n", (int)s, rnd_start, num_limit); + continue; + } + + size_t batch_cnt = 0; + do { + jungle::Record rec_out; + s = itr.get(rec_out); + if (!s) break; + succ_get_count++; + + //printf("%.*s\n", (int)rec_out.kv.key.size, rec_out.kv.key.data); + rec_out.free(); + if (batch_cnt++ >= args->batchSize) break; + } while (itr.next().ok()); + + itr.close(); + succ_count++; + } + TestSuite::_msg("%zu %zu successful reads, %ld failed reads\n", + succ_count, succ_get_count, fail_count); + + return 0; +} + +int itr_stress_test(size_t dur_sec) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.fdbCacheSize 
= (uint64_t)1024*1024*1024; // 1GB + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + //config.numL0Partitions = 4; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t max_num = 1000000; + size_t idx = 1; + IteratorArgs args; + args.db = db; + args.maxNum = max_num; + + TestSuite::ThreadHolder h(&args, iterator_worker, nullptr); + + TestSuite::Progress pp(dur_sec); + TestSuite::Timer tt(dur_sec * 1000); + while (!tt.timeover()) { + size_t number = (idx * 7) % max_num; + std::string key = "k" + TestSuite::lzStr(7, number); + std::string val = "v" + TestSuite::lzStr(7, number); + CHK_Z( db->setSN(idx, jungle::KV(key, val)) ); + args.lastInsertedNumber = number; + idx++; + + if ( idx >= 10000 && + idx % 1000 == 0) { + args.lastFlushedSeq = idx - 1000; + CHK_Z( db->flushLogsAsync( jungle::FlushOptions(), + nullptr, nullptr, + idx - 1000 ) ); + } + + uint64_t cur_sec = tt.getTimeUs() / 1000000; + pp.update(cur_sec); + } + + args.termSignal = true; + h.join(); + CHK_Z(h.getResult()); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +} +using namespace ns_iterator_stress_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.options.printTestMessage = true; + ts.doTest( "iterator stress test", itr_stress_test, TestRange({10}) ); + + return 0; +} + diff --git a/tests/stress/log_reclaim_stress_test.cc b/tests/stress/log_reclaim_stress_test.cc new file mode 100644 index 0000000..5a84782 --- /dev/null +++ b/tests/stress/log_reclaim_stress_test.cc @@ -0,0 +1,191 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "config_test_common.h" +#include "libjungle/jungle.h" +#include "test_common.h" + +#include +#include + +#include + +namespace log_reclaim_stress_test { + +struct ReaderArgs : TestSuite::ThreadArgs { + ReaderArgs() + : durationSec(1) + , db(nullptr) + , stopSignal(false) + , curNextSlot(nullptr) + , curStartIdx(nullptr) + {} + size_t durationSec; + jungle::DB* db; + std::atomic stopSignal; + std::atomic* curNextSlot; + std::atomic* curStartIdx; +}; + +int purge_thread_func(TestSuite::ThreadArgs* t_args) { + ReaderArgs* args = (ReaderArgs*)t_args; + + while (!args->stopSignal) { + TestSuite::sleep_sec(1); + uint64_t next_slot = args->curNextSlot->load(); + if (next_slot > 10000) { + jungle::FlushOptions f_opt; + f_opt.purgeOnly = true; + args->db->flushLogs(f_opt, next_slot - 10000); + args->curStartIdx->store(next_slot - 10000 + 1); + } + } + + return 0; +} + +int point_query_thread_func(TestSuite::ThreadArgs* t_args) { + ReaderArgs* args = (ReaderArgs*)t_args; + + while (!args->stopSignal) { + uint64_t start_idx = args->curStartIdx->load(); + uint64_t next_slot = args->curNextSlot->load(); + uint64_t gap = std::min((uint64_t)5000, next_slot - start_idx); + if (!gap) continue; + + uint64_t r = next_slot - (std::rand() % gap); + for (size_t ii=r; iidb->getSN(ii+1, kv_out); + kv_out.free(); + } + } + + return 0; +} + +int range_query_thread_func(TestSuite::ThreadArgs* t_args) { + ReaderArgs* args = (ReaderArgs*)t_args; + + while (!args->stopSignal) { + TestSuite::sleep_sec(1); + + 
uint64_t start_idx = args->curStartIdx->load(); + uint64_t next_slot = args->curNextSlot->load(); + uint64_t gap = std::min((uint64_t)1000, next_slot - start_idx); + if (!gap) continue; + + uint64_t r = next_slot - (std::rand() % gap); + + jungle::Iterator itr; + itr.initSN(args->db, r); + do { + jungle::Record rec_out; + jungle::Status s = itr.get(rec_out); + if (!s) break; + rec_out.free(); + } while (itr.next().ok()); + itr.close(); + } + + return 0; +} + +int log_reclaim_with_queries_test(size_t dur_sec) { + std::string filename; + TEST_SUITE_PREPARE_PATH(filename); + + jungle::Status s; + + jungle::GlobalConfig g_config; + g_config.logFileReclaimerSleep_sec = 1; + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.numL0Partitions = 4; + config.maxEntriesInLogFile = 500; + config.logSectionOnly = true; + config.logFileTtl_sec = 1; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + std::vector args(3); + + std::atomic next_slot(0); + std::atomic start_idx(0); + + for (ReaderArgs& entry: args) { + entry.durationSec = dur_sec; + entry.db = db; + entry.curNextSlot = &next_slot; + entry.curStartIdx = &start_idx; + } + + // 2K writes/sec + TestSuite::WorkloadGenerator wg(2000.0); + TestSuite::Timer tt(dur_sec * 1000); + TestSuite::Progress pp(dur_sec); + + TestSuite::ThreadHolder purge_thread(&args[0], purge_thread_func, nullptr); + TestSuite::ThreadHolder point_thread(&args[1], point_query_thread_func, nullptr); + TestSuite::ThreadHolder range_thread(&args[2], range_query_thread_func, nullptr); + + char val_buf[256]; + memset(val_buf, 'x', 256); + jungle::SizedBuf val(256, val_buf); + while (!tt.timeover()) { + if (!wg.getNumOpsToDo()) { + TestSuite::sleep_us(500); + continue; + } + + std::string key_str = "seq" + TestSuite::lzStr(7, next_slot); + jungle::SizedBuf key(key_str); + CHK_Z( db->setSN(next_slot+1, jungle::KV(key, val)) ); + (void)db->sync(false); // Sync may fail. 
+ next_slot++; + + pp.update(tt.getTimeUs() / 1000000); + wg.addNumOpsDone(1); + } + pp.done(); + for (ReaderArgs& entry: args) entry.stopSignal = true; + + purge_thread.join(); + point_thread.join(); + range_thread.join(); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TEST_SUITE_CLEANUP_PATH(); + return 0; +} + +} using namespace log_reclaim_stress_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.options.printTestMessage = true; + ts.doTest("log reclaim with point and range queries test", + log_reclaim_with_queries_test, + TestRange({10})); + + return 0; +} diff --git a/tests/stress/many_log_files_test.cc b/tests/stress/many_log_files_test.cc new file mode 100644 index 0000000..8c296c6 --- /dev/null +++ b/tests/stress/many_log_files_test.cc @@ -0,0 +1,78 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "config_test_common.h" +#include "libjungle/jungle.h" +#include "test_common.h" + +#include + +int many_log_files_test(size_t num_logs) { + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + jungle::GlobalConfig g_config; + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + config.maxEntriesInLogFile = 1000; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + size_t num_kv = config.maxEntriesInLogFile * num_logs; + TestSuite::Progress pp(num_kv, "insert"); + for (size_t ii=0; iiset(jungle::KV(key, val)) ); + pp.update(ii); + } + pp.update(num_kv); + + TestSuite::Timer tt(5000); + pp = TestSuite::Progress(num_kv, "retrieve"); + for (size_t ii=0; iiget(jungle::SizedBuf(key), value_out) ); + value_out.free(); + pp.update(ii); + if (tt.timeover()) break; + } + pp.update(num_kv); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TestSuite::clearTestFile(prefix, TestSuite::END_OF_TEST); + return 0; +} + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.options.printTestMessage = true; + ts.doTest( "many log files test", many_log_files_test, TestRange({2048}) ); + + return 0; +} + diff --git a/tests/stress/purge_stress_test.cc b/tests/stress/purge_stress_test.cc new file mode 100644 index 0000000..fca7591 --- /dev/null +++ b/tests/stress/purge_stress_test.cc @@ -0,0 +1,217 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "config_test_common.h" +#include "libjungle/jungle.h" +#include "test_common.h" + +#include +#include + +#include + +struct PurgeReaderArgs : TestSuite::ThreadArgs { + PurgeReaderArgs() + : TestSuite::ThreadArgs() + , lastPurgedSeq(0) + , lastInsertedSeq(0) + , termSignal(false) {} + jungle::DB* db; + std::atomic lastPurgedSeq; + std::atomic lastInsertedSeq; + std::atomic termSignal; +}; + +int purge_stress_test_reader(TestSuite::ThreadArgs* base_args) { + PurgeReaderArgs* args = static_cast(base_args); + uint64_t succ_count = 0; + uint64_t fail_count = 0; + + while ( !args->termSignal ) { + if (args->lastInsertedSeq < 1000) { + std::this_thread::yield(); + continue; + } + + uint64_t purged = args->lastPurgedSeq; + uint64_t inserted = args->lastInsertedSeq; + for (uint64_t ii=purged+1; iidb->getSN(ii, kv_out); + if (!s) { + fail_count++; + purged = args->lastPurgedSeq; + CHK_GTEQ(purged, ii); + ii = purged + 1; + continue; + } + + std::string key = "k" + TestSuite::lzStr(7, ii); + std::string val = "v" + TestSuite::lzStr(7, ii); + jungle::KV kv_expected(key, val); + CHK_EQ(kv_expected.key, kv_out.key); + CHK_EQ(kv_expected.value, kv_out.value); + kv_out.free(); + succ_count++; + } + } + TestSuite::_msg("%ld successful reads, %ld outdated reads\n", succ_count, fail_count); + + return 0; +} + +int purge_stress_basic_test(size_t dur_sec) { + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = 
TestSuite::getTestFileName(prefix); + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.fdbCacheSize = (uint64_t)1024*1024*1024; // 1GB + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + //config.maxEntriesInLogFile = 1000; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + uint64_t idx = 1; + PurgeReaderArgs args; + args.db = db; + + TestSuite::ThreadHolder h(&args, purge_stress_test_reader, nullptr); + + TestSuite::Progress pp(dur_sec); + TestSuite::Timer tt(dur_sec * 1000); + while (!tt.timeover()) { + std::string key = "k" + TestSuite::lzStr(7, idx); + std::string val = "v" + TestSuite::lzStr(7, idx); + CHK_Z( db->setSN(idx, jungle::KV(key, val)) ); + args.lastInsertedSeq = idx; + idx++; + + if ( idx >= 10000 && + idx % 5000 == 0) { + args.lastPurgedSeq = idx - 5000; + CHK_Z( db->flushLogsAsync( jungle::FlushOptions(), + nullptr, nullptr, + idx - 5000 ) ); + } + + uint64_t cur_sec = tt.getTimeUs() / 1000000; + pp.update(cur_sec); + } + + args.termSignal = true; + h.join(); + CHK_Z(h.getResult()); + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TestSuite::clearTestFile(prefix, TestSuite::END_OF_TEST); + return 0; +} + +int purge_with_auto_flusher_test(size_t dur_sec) { + jungle::Status s; + + const std::string prefix = TEST_SUITE_AUTO_PREFIX; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + jungle::GlobalConfig g_config; + g_config.numFlusherThreads = 1; + g_config.fdbCacheSize = (uint64_t)1024*1024*1024; // 1GB + jungle::init(g_config); + + jungle::DBConfig config; + TEST_CUSTOM_DB_CONFIG(config) + config.logSectionOnly = true; + //config.maxEntriesInLogFile = 1000; + + jungle::DB* db; + CHK_Z(jungle::DB::open(&db, filename, config)); + + uint64_t idx = 1; + PurgeReaderArgs args; + args.db = db; + + TestSuite::ThreadHolder h(&args, purge_stress_test_reader, nullptr); + + 
TestSuite::Progress pp(dur_sec); + TestSuite::Timer tt(dur_sec * 1000); + while (!tt.timeover()) { + std::string key = "k" + TestSuite::lzStr(7, idx); + std::string val = "v" + TestSuite::lzStr(7, idx); + CHK_Z( db->setSN(idx, jungle::KV(key, val)) ); + args.lastInsertedSeq = idx; + idx++; + + uint64_t cur_sec = tt.getTimeUs() / 1000000; + pp.update(cur_sec); + } + + args.termSignal = true; + h.join(); + CHK_Z(h.getResult()); + + // Close, reopen, verify (twice). + for (size_t kk=0; kk<2; ++kk) { + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::DB::open(&db, filename, config)); + + pp = TestSuite::Progress(args.lastInsertedSeq, "verifying"); + for (uint64_t ii=1; ii<=args.lastInsertedSeq; ++ii) { + std::string key = "k" + TestSuite::lzStr(7, ii); + std::string val = "v" + TestSuite::lzStr(7, ii); + + jungle::SizedBuf key_req(key); + jungle::SizedBuf value_out; + s = db->get(key_req, value_out); + CHK_Z(s); + + CHK_EQ(jungle::SizedBuf(val), value_out); + value_out.free(); + pp.update(ii); + } + pp.done(); + } + + CHK_Z(jungle::DB::close(db)); + CHK_Z(jungle::shutdown()); + + TestSuite::clearTestFile(prefix, TestSuite::END_OF_TEST); + return 0; +} + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.options.printTestMessage = true; + ts.doTest( "purge stress test", purge_stress_basic_test, TestRange({10}) ); + ts.doTest( "purge with auto flusher test", + purge_with_auto_flusher_test, TestRange({5}) ); + + return 0; +} + diff --git a/tests/test_common.h b/tests/test_common.h new file mode 100644 index 0000000..7e67222 --- /dev/null +++ b/tests/test_common.h @@ -0,0 +1,1561 @@ +/************************************************************************ +Modifications Copyright 2017-2019 eBay Inc. + +Original Copyright 2017 Jung-Sang Ahn +See URL: https://github.com/greensky00/testsuite + (v0.1.68) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if defined(__linux__) || defined(__APPLE__) + #include + #include +#elif defined(WIN32) || defined(_WIN32) + #define NOMINMAX + #include + #include + typedef SSIZE_T ssize_t; +#endif + +#ifndef _CLM_DEFINED +#define _CLM_DEFINED (1) + +#ifdef TESTSUITE_NO_COLOR + #define _CLM_D_GRAY "" + #define _CLM_GREEN "" + #define _CLM_B_GREEN "" + #define _CLM_RED "" + #define _CLM_B_RED "" + #define _CLM_BROWN "" + #define _CLM_B_BROWN "" + #define _CLM_BLUE "" + #define _CLM_B_BLUE "" + #define _CLM_MAGENTA "" + #define _CLM_B_MAGENTA "" + #define _CLM_CYAN "" + #define _CLM_END "" + + #define _CLM_WHITE_FG_RED_BG "" +#else + #define _CLM_D_GRAY "\033[1;30m" + #define _CLM_GREEN "\033[32m" + #define _CLM_B_GREEN "\033[1;32m" + #define _CLM_RED "\033[31m" + #define _CLM_B_RED "\033[1;31m" + #define _CLM_BROWN "\033[33m" + #define _CLM_B_BROWN "\033[1;33m" + #define _CLM_BLUE "\033[34m" + #define _CLM_B_BLUE "\033[1;34m" + #define _CLM_MAGENTA "\033[35m" + #define _CLM_B_MAGENTA "\033[1;35m" + #define _CLM_CYAN "\033[36m" + #define _CLM_B_GREY "\033[1;37m" + #define _CLM_END "\033[0m" + + #define _CLM_WHITE_FG_RED_BG "\033[37;41m" +#endif + +#define _CL_D_GRAY(str) _CLM_D_GRAY str _CLM_END +#define _CL_GREEN(str) _CLM_GREEN str _CLM_END +#define _CL_RED(str) _CLM_RED str _CLM_END +#define 
_CL_B_RED(str) _CLM_B_RED str _CLM_END +#define _CL_MAGENTA(str) _CLM_MAGENTA str _CLM_END +#define _CL_BROWN(str) _CLM_BROWN str _CLM_END +#define _CL_B_BROWN(str) _CLM_B_BROWN str _CLM_END +#define _CL_B_BLUE(str) _CLM_B_BLUE str _CLM_END +#define _CL_B_MAGENTA(str) _CLM_B_MAGENTA str _CLM_END +#define _CL_CYAN(str) _CLM_CYAN str _CLM_END +#define _CL_B_GRAY(str) _CLM_B_GREY str _CLM_END + +#define _CL_WHITE_FG_RED_BG(str) _CLM_WHITE_FG_RED_BG str _CLM_END + +#endif + +#define __COUT_STACK_INFO__ \ + std::endl \ + << " time: " << _CLM_D_GRAY << \ + TestSuite::getTimeString() << _CLM_END << "\n" \ + << " thread: " << _CLM_BROWN \ + << std::hex << std::setw(4) << std::setfill('0') << \ + (std::hash{}( std::this_thread::get_id() ) & 0xffff) \ + << std::dec << _CLM_END << "\n" \ + << " in: " << _CLM_CYAN << __func__ << "()" _CLM_END << "\n" \ + << " at: " << _CLM_GREEN << __FILE__ << _CLM_END ":" \ + << _CLM_B_MAGENTA << __LINE__ << _CLM_END << "\n" \ + +// exp_value == value +#define CHK_EQ(exp_value, value) \ +{ \ + auto _ev = (exp_value); \ + decltype(_ev) _v = (decltype(_ev))(value); \ + if (_ev != _v) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " value of: " _CLM_B_BLUE #value _CLM_END "\n" \ + << " expected: " _CLM_B_GREEN << _ev << _CLM_END "\n" \ + << " actual: " _CLM_B_RED << _v << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } \ +} + +// exp_value != value +#define CHK_NEQ(exp_value, value) \ +{ \ + auto _ev = (exp_value); \ + decltype(_ev) _v = (decltype(_ev))(value); \ + if (_ev == _v) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " value of: " _CLM_B_BLUE #value _CLM_END "\n" \ + << " expected: not " _CLM_B_GREEN << _ev << _CLM_END "\n" \ + << " actual: " _CLM_B_RED << _v << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } \ +} + +// value == true +#define CHK_OK(value) \ + if (!(value)) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " value of: " _CLM_B_BLUE #value _CLM_END "\n" \ + << " expected: " 
_CLM_B_GREEN << "true" << _CLM_END "\n" \ + << " actual: " _CLM_B_RED << "false" << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } + +#define CHK_TRUE(value) CHK_OK(value) + +// value == false +#define CHK_NOT(value) \ + if (value) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " value of: " _CLM_B_BLUE #value _CLM_END "\n" \ + << " expected: " _CLM_B_GREEN << "false" << _CLM_END "\n" \ + << " actual: " _CLM_B_RED << "true" << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } + +#define CHK_FALSE(value) CHK_NOT(value) + +// value == NULL +#define CHK_NULL(value) \ +{ \ + auto _v = (value); \ + if (_v) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " value of: " _CLM_B_BLUE #value _CLM_END "\n" \ + << " expected: " _CLM_B_GREEN << "NULL" << _CLM_END "\n"; \ + printf(" actual: " _CLM_B_RED "%p" _CLM_END "\n", _v); \ + TestSuite::failHandler(); \ + return -1; \ + } \ +} + +// value != NULL +#define CHK_NONNULL(value) \ + if (!(value)) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " value of: " _CLM_B_BLUE #value _CLM_END "\n" \ + << " expected: " _CLM_B_GREEN << "non-NULL" << _CLM_END "\n" \ + << " actual: " _CLM_B_RED << "NULL" << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } + +// value == 0 +#define CHK_Z(value) \ +{ \ + auto _v = (value); \ + if ((0) != _v) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " value of: " _CLM_B_BLUE #value _CLM_END "\n" \ + << " expected: " _CLM_B_GREEN << "0" << _CLM_END "\n" \ + << " actual: " _CLM_B_RED << _v << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } \ +} + +// smaller < greater +#define CHK_SM(smaller, greater) \ +{ \ + auto _sm = (smaller); \ + decltype(_sm) _gt = (decltype(_sm))(greater); \ + if (!(_sm < _gt)) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " expected: " \ + << _CLM_B_BLUE #smaller " < " #greater _CLM_END "\n" \ + << " value of " \ + << _CLM_B_GREEN #smaller _CLM_END ": " \ + << _CLM_B_RED << _sm << _CLM_END "\n" \ + 
<< " value of " \ + << _CLM_B_GREEN #greater _CLM_END ": " \ + << _CLM_B_RED << _gt << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } \ +} + +// smaller <= greater +#define CHK_SMEQ(smaller , greater) \ +{ \ + auto _sm = (smaller); \ + decltype(_sm) _gt = (decltype(_sm))(greater); \ + if (!(_sm <= _gt)) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " expected: " \ + << _CLM_B_BLUE #smaller " <= " #greater _CLM_END "\n" \ + << " value of " \ + << _CLM_B_GREEN #smaller _CLM_END ": " \ + << _CLM_B_RED << _sm << _CLM_END "\n" \ + << " value of " \ + << _CLM_B_GREEN #greater _CLM_END ": " \ + << _CLM_B_RED << _gt << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } \ +} + +// greater > smaller +#define CHK_GT(greater, smaller) \ +{ \ + auto _sm = (smaller); \ + decltype(_sm) _gt = (decltype(_sm))(greater); \ + if (!(_gt > _sm)) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " expected: " \ + << _CLM_B_BLUE #greater " > " #smaller _CLM_END "\n" \ + << " value of " \ + << _CLM_B_GREEN #greater _CLM_END ": " \ + << _CLM_B_RED << _gt << _CLM_END "\n" \ + << " value of " \ + << _CLM_B_GREEN #smaller _CLM_END ": " \ + << _CLM_B_RED << _sm << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } \ +} + +// greater >= smaller +#define CHK_GTEQ(greater, smaller) \ +{ \ + auto _sm = (smaller); \ + decltype(_sm) _gt = (decltype(_sm))(greater); \ + if (!(_gt >= _sm)) { \ + std::cout \ + << __COUT_STACK_INFO__ \ + << " expected: " \ + << _CLM_B_BLUE #greater " >= " #smaller _CLM_END "\n" \ + << " value of " \ + << _CLM_B_GREEN #greater _CLM_END ": " \ + << _CLM_B_RED << _gt << _CLM_END "\n" \ + << " value of " \ + << _CLM_B_GREEN #smaller _CLM_END ": " \ + << _CLM_B_RED << _sm << _CLM_END "\n"; \ + TestSuite::failHandler(); \ + return -1; \ + } \ +} + + +using test_func = std::function; + +class TestArgsBase; +using test_func_args = std::function; + +class TestSuite; +class TestArgsBase { +public: + virtual ~TestArgsBase() { } + 
void setCallback(std::string test_name, + test_func_args func, + TestSuite* test_instance) { + testName = test_name; + testFunction = func; + testInstance = test_instance; + } + void testAll() { testAllInternal(0); } + virtual void setParam(size_t param_no, size_t param_idx) = 0; + virtual size_t getNumSteps(size_t param_no) = 0; + virtual size_t getNumParams() = 0; + virtual std::string toString() = 0; + +private: + inline void testAllInternal(size_t depth); + std::string testName; + test_func_args testFunction; + TestSuite* testInstance; +}; + +class TestArgsWrapper { +public: + TestArgsWrapper(TestArgsBase* _test_args) : test_args(_test_args) {} + ~TestArgsWrapper() { delete test_args; } + TestArgsBase* getArgs() const { return test_args; } + operator TestArgsBase*() const { return getArgs(); } +private: + TestArgsBase* test_args; +}; + +enum class StepType { + LINEAR, + EXPONENTIAL +}; + +template +class TestRange { +public: + TestRange() : type(RangeType::NONE), begin(), end(), step() {} + + // Constructor for given values + TestRange(const std::vector& _array) + : type(RangeType::ARRAY), array(_array) + , begin(), end(), step() {} + + // Constructor for regular steps + TestRange(T _begin, T _end, T _step, StepType _type) + : begin(_begin), end(_end), step(_step) + { + if (_type == StepType::LINEAR) { + type = RangeType::LINEAR; + } else { + type = RangeType::EXPONENTIAL; + } + } + + T getEntry(size_t idx) { + if (type == RangeType::ARRAY) { + return (T)(array[idx]); + } else if (type == RangeType::LINEAR) { + return (T)(begin + step * idx); + } else if (type == RangeType::EXPONENTIAL) { + ssize_t _begin = begin; + ssize_t _step = step; + ssize_t _ret = (ssize_t)( _begin * std::pow(_step, idx) ); + return (T)(_ret); + } + + return begin; + } + + size_t getSteps() { + if (type == RangeType::ARRAY) { + return array.size(); + } else if (type == RangeType::LINEAR) { + return ((end - begin) / step) + 1; + } else if (type == RangeType::EXPONENTIAL) { + ssize_t coe = 
((ssize_t)end) / ((ssize_t)begin); + double steps_double = (double)std::log(coe) / std::log(step); + return (size_t)(steps_double + 1); + } + + return 0; + } + +private: + enum class RangeType { + NONE, + ARRAY, + LINEAR, + EXPONENTIAL + }; + + RangeType type; + std::vector array; + T begin; + T end; + T step; +}; + +struct TestOptions { + TestOptions() + : printTestMessage(false) + , abortOnFailure(false) + , preserveTestFiles(false) + {} + bool printTestMessage; + bool abortOnFailure; + bool preserveTestFiles; +}; + +class TestSuite { + friend TestArgsBase; +private: + static std::mutex& getResMsgLock() { + static std::mutex res_msg_lock; + return res_msg_lock; + } + static std::string& getResMsg() { + static std::string res_msg; + return res_msg; + } + static std::string& getInfoMsg() { + thread_local std::string info_msg; + return info_msg; + } + static std::string& getTestName() { + static std::string test_name; + return test_name; + } + static TestSuite*& getCurTest() { + static TestSuite* cur_test; + return cur_test; + } +public: + static bool& globalMsgFlag() { + static bool global_msg_flag = false; + return global_msg_flag; + } + static std::string getCurrentTestName() { + return getTestName(); + } + static bool isMsgAllowed() { + TestSuite* cur_test = TestSuite::getCurTest(); + if ( cur_test && + (cur_test->options.printTestMessage || cur_test->displayMsg) && + !cur_test->suppressMsg ) { + return true; + } + if (globalMsgFlag()) return true; + return false; + } + + static void setInfo(const char* format, ...) 
{ + thread_local char info_buf[4096]; + size_t len = 0; + va_list args; + va_start(args, format); + len += vsnprintf(info_buf + len, 4096 - len, format, args); + va_end(args); + getInfoMsg() = info_buf; + } + static void clearInfo() { + getInfoMsg().clear(); + } + + static void failHandler() { + if (!getInfoMsg().empty()) { + std::cout << " info: " << getInfoMsg() << std::endl; + } + } + + static void usage(int argc, char** argv) { + printf("\n"); + printf("Usage: %s [-f ] [-r ] [-p]\n", argv[0]); + printf("\n"); + printf(" -f, --filter\n"); + printf(" Run specific tests matching the given keyword.\n"); + printf(" -r, --range\n"); + printf(" Run TestRange-based tests using given parameter value.\n"); + printf(" -p, --preserve\n"); + printf(" Do not clean up test files.\n"); + printf(" --abort-on-failure\n"); + printf(" Immediately abort the test if failure happens.\n"); + printf(" --suppress-msg\n"); + printf(" Suppress test messages.\n"); + printf(" --display-msg\n"); + printf(" Display test messages.\n"); + printf("\n"); + } + + static std::string usToString(uint64_t us) { + std::stringstream ss; + if (us < 1000) { + // us + ss << std::fixed << std::setprecision(0) << us << " us"; + } else if (us < 1000000) { + // ms + double tmp = static_cast(us / 1000.0); + ss << std::fixed << std::setprecision(1) << tmp << " ms"; + } else if (us < (uint64_t)600 * 1000000) { + // second: 1 s -- 600 s (10 mins) + double tmp = static_cast(us / 1000000.0); + ss << std::fixed << std::setprecision(1) << tmp << " s"; + } else { + // minute + double tmp = static_cast(us / 60.0 / 1000000.0); + ss << std::fixed << std::setprecision(0) << tmp << " m"; + } + return ss.str(); + } + + static std::string countToString(uint64_t count) { + std::stringstream ss; + if (count < 1000) { + ss << count; + } else if (count < 1000000) { + // K + double tmp = static_cast(count / 1000.0); + ss << std::fixed << std::setprecision(1) << tmp << "K"; + } else if (count < (uint64_t)1000000000) { + // M + 
double tmp = static_cast(count / 1000000.0); + ss << std::fixed << std::setprecision(1) << tmp << "M"; + } else { + // B + double tmp = static_cast(count / 1000000000.0); + ss << std::fixed << std::setprecision(1) << tmp << "B"; + } + return ss.str(); + } + + static std::string sizeToString(uint64_t size) { + std::stringstream ss; + if (size < 1024) { + ss << size << " B"; + } else if (size < 1024*1024) { + // K + double tmp = static_cast(size / 1024.0); + ss << std::fixed << std::setprecision(1) << tmp << " KiB"; + } else if (size < (uint64_t)1024*1024*1024) { + // M + double tmp = static_cast(size / 1024.0 / 1024.0); + ss << std::fixed << std::setprecision(1) << tmp << " MiB"; + } else { + // B + double tmp = static_cast(size / 1024.0 / 1024.0 / 1024.0); + ss << std::fixed << std::setprecision(1) << tmp << " GiB"; + } + return ss.str(); + } + +private: + struct TimeInfo { + TimeInfo(std::tm* src) + : year(src->tm_year + 1900) + , month(src->tm_mon + 1) + , day(src->tm_mday) + , hour(src->tm_hour) + , min(src->tm_min) + , sec(src->tm_sec) + , msec(0) + , usec(0) {} + TimeInfo(std::chrono::system_clock::time_point now) { + std::time_t raw_time = std::chrono::system_clock::to_time_t(now); + std::tm new_time; + +#if defined(__linux__) || defined(__APPLE__) + std::tm* lt_tm = localtime_r(&raw_time, &new_time); + +#elif defined(WIN32) || defined(_WIN32) + localtime_s(&new_time, &raw_time); + std::tm* lt_tm = &new_time; +#endif + + year = lt_tm->tm_year + 1900; + month = lt_tm->tm_mon + 1; + day = lt_tm->tm_mday; + hour = lt_tm->tm_hour; + min = lt_tm->tm_min; + sec = lt_tm->tm_sec; + + size_t us_epoch = std::chrono::duration_cast + < std::chrono::microseconds > + ( now.time_since_epoch() ).count(); + msec = (us_epoch / 1000) % 1000; + usec = us_epoch % 1000; + } + int year; + int month; + int day; + int hour; + int min; + int sec; + int msec; + int usec; + }; + +public: + TestSuite(int argc = 0, char **argv = nullptr) + : cntPass(0) + , cntFail(0) + , 
useGivenRange(false) + , preserveTestFiles(false) + , forceAbortOnFailure(false) + , suppressMsg(false) + , displayMsg(false) + , givenRange(0) + , startTimeGlobal(std::chrono::system_clock::now()) + { + for (int ii=1; ii cur_time = + std::chrono::system_clock::now();; + std::chrono::duration elapsed = cur_time - startTimeGlobal; + std::string time_str = usToString + ( (uint64_t)(elapsed.count() * 1000000) ); + + printf(_CL_GREEN("%zu") " tests passed", cntPass); + if (cntFail) { + printf(", " _CL_RED("%zu") " tests failed", cntFail); + } + printf(" out of " _CL_CYAN("%zu") " (" _CL_BROWN("%s") ")\n", + cntPass+cntFail, time_str.c_str()); + } + + // === Helper functions ==================================== + static std::string getTestFileName(const std::string& prefix) { + TimeInfo lt(std::chrono::system_clock::now()); + (void)lt; + + char time_char[64]; + sprintf(time_char, "%04d%02d%02d_%02d%02d%02d", + lt.year, lt.month, lt.day, lt.hour, lt.min, lt.sec); + + std::string ret = prefix; + ret += "_"; + ret += time_char; + return ret; + } + + static std::string getTimeString() { + TimeInfo lt(std::chrono::system_clock::now()); + char time_char[64]; + sprintf(time_char, "%04d-%02d-%02d %02d:%02d:%02d.%03d%03d", + lt.year, lt.month, lt.day, lt.hour, lt.min, lt.sec, lt.msec, lt.usec); + return time_char; + } + static std::string getTimeStringShort() { + TimeInfo lt(std::chrono::system_clock::now()); + char time_char[64]; + sprintf(time_char, "%02d:%02d.%03d %03d", + lt.min, lt.sec, lt.msec, lt.usec); + return time_char; + } + static std::string getTimeStringPlain() { + TimeInfo lt(std::chrono::system_clock::now()); + char time_char[64]; + sprintf(time_char, "%02d%02d_%02d%02d%02d", + lt.month, lt.day, lt.hour, lt.min, lt.sec); + return time_char; + } + + static int mkdir(const std::string& path) { +#if defined(__linux__) || defined(__APPLE__) + struct stat st; + if (stat(path.c_str(), &st) != 0) { + return ::mkdir(path.c_str(), 0755); + } + +#elif defined(WIN32) || 
defined(_WIN32) + if (GetFileAttributes(path.c_str()) == INVALID_FILE_ATTRIBUTES) { + return _mkdir(path.c_str()); + } +#endif + return 0; + } + static int copyfile(const std::string& src, + const std::string& dst) { +#if defined(__linux__) || defined(__APPLE__) + std::string cmd = "cp -R " + src + " " + dst; + int rc = ::system(cmd.c_str()); + return rc; + +#elif defined(WIN32) || defined(_WIN32) + // TODO: `xcopy` only copies folders, not files. + std::string cmd = "xcopy /e /i /h " + src + " " + dst + " > NUL"; + int rc = ::system(cmd.c_str()); + return rc; +#endif + } + static int remove(const std::string& path) { + int rc = ::remove(path.c_str()); + return rc; + } + static bool exist(const std::string& path) { +#if defined(__linux__) || defined(__APPLE__) + struct stat st; + int result = stat(path.c_str(), &st); + return (result == 0); + +#elif defined(WIN32) || defined(_WIN32) + if (GetFileAttributes(path.c_str()) != INVALID_FILE_ATTRIBUTES) { + return true; + } + return false; +#endif + } + + enum TestPosition { + BEGINNING_OF_TEST = 0, + MIDDLE_OF_TEST = 1, + END_OF_TEST = 2, + }; + static void clearTestFile( const std::string& prefix, + TestPosition test_pos = MIDDLE_OF_TEST ) { + TestSuite*& cur_test = TestSuite::getCurTest(); + if ( test_pos == END_OF_TEST && + ( cur_test->preserveTestFiles || + cur_test->options.preserveTestFiles ) ) return; + + int r; +#if defined(__linux__) || defined(__APPLE__) + std::string command = "rm -rf "; + command += prefix; + command += "*"; + r = system(command.c_str()); + (void)r; + +#elif defined(WIN32) || defined(_WIN32) + std::string command = "del /s /f /q "; + command += prefix; + command += "* > NUL"; + r = system(command.c_str()); + (void)r; + + // Windows `del` operation cannot delete folders. + // Just in case if there are any folders. 
+ WIN32_FIND_DATA filedata; + HANDLE hfind; + std::string query_str = prefix + "*"; + hfind = FindFirstFile(query_str.c_str(), &filedata); + while (hfind != INVALID_HANDLE_VALUE) { + std::string f_name(filedata.cFileName); + size_t f_name_pos = f_name.find(prefix); + if (f_name_pos != std::string::npos) { + command = "rmdir /s /q " + f_name + " > NUL"; + r = system(command.c_str()); + (void)r; + } + + if (!FindNextFile(hfind, &filedata)) { + FindClose(hfind); + hfind = INVALID_HANDLE_VALUE; + } + } +#endif + } + + static void setResultMessage(const std::string& msg) { + TestSuite::getResMsg() = msg; + } + + static void appendResultMessage(const std::string& msg) { + std::lock_guard l(TestSuite::getResMsgLock()); + TestSuite::getResMsg() += msg; + } + + static size_t _msg(const char* format, ...) { + size_t cur_len = 0; + TestSuite* cur_test = TestSuite::getCurTest(); + if ( ( cur_test && + (cur_test->options.printTestMessage || cur_test->displayMsg) && + !cur_test->suppressMsg ) || + globalMsgFlag() ) { + va_list args; + va_start(args, format); + cur_len += vprintf(format, args); + va_end(args); + } + return cur_len; + } + static size_t _msgt(const char* format, ...) 
{ + size_t cur_len = 0; + TestSuite* cur_test = TestSuite::getCurTest(); + if ( ( cur_test && + (cur_test->options.printTestMessage || cur_test->displayMsg) && + !cur_test->suppressMsg ) || + globalMsgFlag() ) { + std::cout << _CLM_D_GRAY + << getTimeStringShort() << _CLM_END << "] "; + va_list args; + va_start(args, format); + cur_len += vprintf(format, args); + va_end(args); + } + return cur_len; + } + + class Msg { + public: + Msg() {} + + template + inline Msg& operator<<(const T& data) { + if (TestSuite::isMsgAllowed()) { + std::cout << data; + } + return *this; + } + + using MyCout = std::basic_ostream< char, std::char_traits >; + typedef MyCout& (*EndlFunc)(MyCout&); + + Msg& operator<<(EndlFunc func) { + if (TestSuite::isMsgAllowed()) { + func(std::cout); + } + return *this; + } + }; + + static void sleep_us(size_t us, const std::string& msg = std::string()) { + if (!msg.empty()) TestSuite::_msg("%s (%zu us)\n", msg.c_str(), us); + std::this_thread::sleep_for(std::chrono::microseconds(us)); + } + static void sleep_ms(size_t ms, const std::string& msg = std::string()) { + if (!msg.empty()) TestSuite::_msg("%s (%zu ms)\n", msg.c_str(), ms); + std::this_thread::sleep_for(std::chrono::milliseconds(ms)); + } + static void sleep_sec(size_t sec, const std::string& msg = std::string()) { + if (!msg.empty()) TestSuite::_msg("%s (%zu s)\n", msg.c_str(), sec); + std::this_thread::sleep_for(std::chrono::seconds(sec)); + } + static std::string lzStr(size_t digit, uint64_t num) { + std::stringstream ss; + ss << std::setw(digit) << std::setfill('0') << std::to_string(num); + return ss.str(); + } + static double calcThroughput(uint64_t ops, uint64_t elapsed_us) { + return ops * 1000000.0 / elapsed_us; + } + static std::string throughputStr(uint64_t ops, uint64_t elapsed_us) { + return countToString(ops * 1000000 / elapsed_us); + } + static std::string sizeThroughputStr(uint64_t size_byte, uint64_t elapsed_us) { + return sizeToString(size_byte * 1000000 / elapsed_us); + } + 
+ // === Timer things ==================================== + class Timer { + public: + Timer() : duration_ms(0) { + reset(); + } + Timer(size_t _duration_ms) : duration_ms(_duration_ms) { + reset(); + } + inline bool timeout() { return timeover(); } + bool timeover() { + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - start; + if (duration_ms < elapsed.count() * 1000) return true; + return false; + } + uint64_t getTimeSec() { + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - start; + return (uint64_t)(elapsed.count()); + } + uint64_t getTimeMs() { + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - start; + return (uint64_t)(elapsed.count() * 1000); + } + uint64_t getTimeUs() { + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - start; + return (uint64_t)(elapsed.count() * 1000000); + } + void reset() { + start = std::chrono::system_clock::now(); + } + void resetSec(size_t _duration_sec) { + duration_ms = _duration_sec * 1000; + reset(); + } + void resetMs(size_t _duration_ms) { + duration_ms = _duration_ms; + reset(); + } + private: + std::chrono::time_point start; + size_t duration_ms; + }; + + // === Workload generator things ==================================== + class WorkloadGenerator { + public: + WorkloadGenerator(double ops_per_sec = 0.0, uint64_t max_ops_per_batch = 0) + : opsPerSec(ops_per_sec) + , maxOpsPerBatch(max_ops_per_batch) + , numOpsDone(0) { + reset(); + } + void reset() { + start = std::chrono::system_clock::now(); + numOpsDone = 0; + } + size_t getNumOpsToDo() { + if (opsPerSec <= 0) return 0; + + auto cur = std::chrono::system_clock::now(); + std::chrono::duration elapsed = cur - start; + + double exp = opsPerSec * elapsed.count(); + if (numOpsDone < exp) { + if (maxOpsPerBatch) { + return std::min(maxOpsPerBatch, (uint64_t)exp - numOpsDone); + } + return (uint64_t)exp - numOpsDone; + } + return 0; + 
} + void addNumOpsDone(size_t num) { + numOpsDone += num; + } + private: + std::chrono::time_point start; + double opsPerSec; + uint64_t maxOpsPerBatch; + uint64_t numOpsDone; + }; + + // === Progress things ================================== + // Progress that knows the maximum value. + class Progress { + public: + Progress(uint64_t _num, + const std::string& _comment = std::string(), + const std::string& _unit = std::string()) + : curValue(0) + , num(_num) + , timer(0) + , lastPrintTimeUs(timer.getTimeUs()) + , comment(_comment) + , unit(_unit) {} + void update(uint64_t cur) { + curValue = cur; + uint64_t curTimeUs = timer.getTimeUs(); + if (curTimeUs - lastPrintTimeUs > 50000 || + cur == 0 || curValue >= num) { + // Print every 0.05 sec (20 Hz). + lastPrintTimeUs = curTimeUs; + std::string _comment = + (comment.empty()) ? "" : comment + ": "; + std::string _unit = + (unit.empty()) ? "" : unit + " "; + + _msg("\r%s%ld/%ld %s(%.1f%%)", + _comment.c_str(), curValue, num, _unit.c_str(), + (double)curValue*100/num); + fflush(stdout); + } + if (curValue >= num) { + _msg("\n"); + fflush(stdout); + } + } + void done() { if (curValue < num) update(num); } + private: + uint64_t curValue; + uint64_t num; + Timer timer; + uint64_t lastPrintTimeUs; + std::string comment; + std::string unit; + }; + + // Progress that doesn't know the maximum value. + class UnknownProgress { + public: + UnknownProgress(const std::string& _comment = std::string(), + const std::string& _unit = std::string()) + : curValue(0) + , timer(0) + , lastPrintTimeUs(timer.getTimeUs()) + , comment(_comment) + , unit(_unit) {} + void update(uint64_t cur) { + curValue = cur; + uint64_t curTimeUs = timer.getTimeUs(); + if ( curTimeUs - lastPrintTimeUs > 50000 || + cur == 0 ) { + // Print every 0.05 sec (20 Hz). + lastPrintTimeUs = curTimeUs; + std::string _comment = + (comment.empty()) ? "" : comment + ": "; + std::string _unit = + (unit.empty()) ? 
"" : unit + " "; + + _msg("\r%s%ld %s", _comment.c_str(), curValue, _unit.c_str()); + fflush(stdout); + } + } + void done() { + _msg("\n"); + fflush(stdout); + } + private: + uint64_t curValue; + Timer timer; + uint64_t lastPrintTimeUs; + std::string comment; + std::string unit; + }; + + // === Displayer things ================================== + class Displayer { + public: + Displayer(size_t num_raws, size_t num_cols) + : numRaws(num_raws) + , numCols(num_cols) + , colWidth(num_cols, 20) + , context(num_raws, std::vector(num_cols)) {} + void init() { + for (size_t ii=0; ii& src) { + size_t num_src = src.size(); + if (!num_src) return; + + for (size_t ii=0; ii= numRaws || col_idx >= numCols) return; + + thread_local char info_buf[32]; + size_t len = 0; + va_list args; + va_start(args, format); + len += vsnprintf(info_buf + len, 20 - len, format, args); + va_end(args); + context[raw_idx][col_idx] = info_buf; + } + void print() { + _msg("\033[%zuA", numRaws); + for (size_t ii=0; ii colWidth; + std::vector< std::vector< std::string > > context; + }; + + // === Gc things ==================================== + template + class GcVar { + public: + GcVar(T& _src, T2 _to) + : src(_src), to(_to) {} + ~GcVar() { + // GC by value. + src = to; + } + private: + T& src; + T2 to; + }; + + class GcFunc { + public: + GcFunc(std::function _func) + : func(_func) {} + ~GcFunc() { + // GC by function. + func(); + } + private: + std::function func; + }; + + // === Thread things ==================================== + struct ThreadArgs { /* Opaque. 
*/ }; + using ThreadFunc = std::function< int(ThreadArgs*) >; + using ThreadExitHandler = std::function< void(ThreadArgs*) >; + +private: + struct ThreadInternalArgs { + ThreadInternalArgs() : userArgs(nullptr), func(nullptr), rc(0) {} + ThreadArgs* userArgs; + ThreadFunc func; + int rc; + }; + +public: + struct ThreadHolder { + ThreadHolder() : tid(nullptr), handler(nullptr) {} + ThreadHolder(std::thread* _tid, ThreadExitHandler _handler) + : tid(_tid), handler(_handler) {} + ThreadHolder(ThreadArgs* u_args, + ThreadFunc t_func, + ThreadExitHandler t_handler) + : tid(nullptr), handler(nullptr) + { spawn(u_args, t_func, t_handler); } + + ~ThreadHolder() { join(true); } + + void spawn(ThreadArgs* u_args, + ThreadFunc t_func, + ThreadExitHandler t_handler) { + if (tid) return; + handler = t_handler; + args.userArgs = u_args; + args.func = t_func; + tid = new std::thread(spawnThread, &args); + } + + void join(bool force = false) { + if (!tid) return; + if (tid->joinable()) { + if (force) { + // Force kill. + handler(args.userArgs); + } + tid->join(); + } + delete tid; + tid = nullptr; + } + int getResult() const { return args.rc; } + std::thread* tid; + ThreadExitHandler handler; + ThreadInternalArgs args; + }; + + + // === doTest things ==================================== + + // 1) Without parameter. + void doTest( const std::string& test_name, + test_func func ) + { + if (!matchFilter(test_name)) return; + + readyTest(test_name); + TestSuite::getResMsg() = ""; + TestSuite::getInfoMsg() = ""; + TestSuite::getCurTest() = this; + int ret = func(); + reportTestResult(test_name, ret); + } + + // 2) Ranged parameter. + template + void doTest( std::string test_name, + F func, + TestRange range ) + { + if (!matchFilter(test_name)) return; + + size_t n = (useGivenRange) ? 1 : range.getSteps(); + size_t i; + + for (i=0; i + void doTest( const std::string& test_name, + F func, + T1 arg1, + T2... 
args ) + { + if (!matchFilter(test_name)) return; + + readyTest(test_name); + TestSuite::getResMsg() = ""; + TestSuite::getInfoMsg() = ""; + TestSuite::getCurTest() = this; + int ret = func(arg1, args...); + reportTestResult(test_name, ret); + } + + // 4) Multi composite parameters. + template + void doTest( const std::string& test_name, + F func, + TestArgsWrapper& args_wrapper ) + { + if (!matchFilter(test_name)) return; + + TestArgsBase* args = args_wrapper.getArgs(); + args->setCallback(test_name, func, this); + args->testAll(); + } + + TestOptions options; + +private: + void doTestCB( const std::string& test_name, + test_func_args func, + TestArgsBase* args ) + { + readyTest(test_name); + TestSuite::getResMsg() = ""; + TestSuite::getInfoMsg() = ""; + TestSuite::getCurTest() = this; + int ret = func(args); + reportTestResult(test_name, ret); + } + + static void spawnThread(ThreadInternalArgs* args) { + args->rc = args->func(args->userArgs); + } + + bool matchFilter(const std::string& test_name) { + if (!filter.empty() && + test_name.find(filter) == std::string::npos) { + // Doesn't match with the given filter. + return false; + } + return true; + } + + void readyTest(const std::string& test_name) { + printf("[ " "...." " ] %s\n", test_name.c_str()); + if ( (options.printTestMessage || displayMsg) && + !suppressMsg ) { + printf(_CL_D_GRAY(" === TEST MESSAGE (BEGIN) ===\n")); + } + fflush(stdout); + + getTestName() = test_name; + startTimeLocal = std::chrono::system_clock::now(); + } + + void reportTestResult(const std::string& test_name, + int result) + { + std::chrono::time_point cur_time = + std::chrono::system_clock::now();; + std::chrono::duration elapsed = cur_time - startTimeLocal; + std::string time_str = usToString + ( (uint64_t)(elapsed.count() * 1000000) ); + + char msg_buf[1024]; + std::string res_msg = TestSuite::getResMsg(); + sprintf(msg_buf, "%s (" _CL_BROWN("%s") ")%s%s", + test_name.c_str(), + time_str.c_str(), + (res_msg.empty() ? 
"" : ": "), + res_msg.c_str() ); + + if (result < 0) { + printf("[ " _CL_RED("FAIL") " ] %s\n", msg_buf); + cntFail++; + } else { + if ( (options.printTestMessage || displayMsg) && + !suppressMsg ) { + printf(_CL_D_GRAY(" === TEST MESSAGE (END) ===\n")); + } else { + // Move a line up. + printf("\033[1A"); + // Clear current line. + printf("\r"); + // And then overwrite. + } + printf("[ " _CL_GREEN("PASS") " ] %s\n", msg_buf); + cntPass++; + } + + if ( result != 0 && + (options.abortOnFailure || forceAbortOnFailure) ) { + abort(); + } + getTestName().clear(); + } + + size_t cntPass; + size_t cntFail; + std::string filter; + bool useGivenRange; + bool preserveTestFiles; + bool forceAbortOnFailure; + bool suppressMsg; + bool displayMsg; + int64_t givenRange; + // Start time of each test. + std::chrono::time_point startTimeLocal; + // Start time of the entire test suite. + std::chrono::time_point startTimeGlobal; +}; + +// ===== Functor ===== + +struct TestArgsSetParamFunctor { + template + void operator()(T* t, TestRange& r, size_t param_idx) const { + *t = r.getEntry(param_idx); + } +}; + +template +inline typename std::enable_if::type +TestArgsSetParamScan(int, + std::tuple &, + std::tuple...> &, + FuncT, + size_t) { } + +template +inline typename std::enable_if::type +TestArgsSetParamScan(int index, + std::tuple& t, + std::tuple...>& r, + FuncT f, + size_t param_idx) { + if (index == 0) f(std::get(t), std::get(r), param_idx); + TestArgsSetParamScan(index-1, t, r, f, param_idx); +} +struct TestArgsGetNumStepsFunctor { + template + void operator()(T* t, TestRange& r, size_t& steps_ret) const { + (void)t; + steps_ret = r.getSteps(); + } +}; + +template +inline typename std::enable_if::type +TestArgsGetStepsScan(int, + std::tuple &, + std::tuple...> &, + FuncT, + size_t) { } + +template +inline typename std::enable_if::type +TestArgsGetStepsScan(int index, + std::tuple& t, + std::tuple...>& r, + FuncT f, + size_t& steps_ret) { + if (index == 0) f(std::get(t), 
std::get(r), steps_ret); + TestArgsGetStepsScan(index-1, t, r, f, steps_ret); +} + +#define TEST_ARGS_CONTENTS() \ + void setParam(size_t param_no, size_t param_idx) { \ + TestArgsSetParamScan(param_no, args, ranges, \ + TestArgsSetParamFunctor(), \ + param_idx); } \ + size_t getNumSteps(size_t param_no) { \ + size_t ret = 0; \ + TestArgsGetStepsScan(param_no, args, ranges, \ + TestArgsGetNumStepsFunctor(), \ + ret); \ + return ret; } \ + size_t getNumParams() { \ + return std::tuple_size::value; \ + } + + +// ===== TestArgsBase ===== + +void TestArgsBase::testAllInternal(size_t depth) { + size_t i; + size_t n_params = getNumParams(); + size_t n_steps = getNumSteps(depth); + + for (i=0; idoTestCB(test_name, + testFunction, + this); + } + } +} + +// ===== Parameter macros ===== + +#define DEFINE_PARAMS_2(name, \ + type1, param1, range1, \ + type2, param2, range2) \ + class name ## _class : public TestArgsBase { \ + public: \ + name ## _class() { \ + args = std::make_tuple(¶m1, ¶m2); \ + ranges = std::make_tuple( \ + TestRangerange1, \ + TestRangerange2 ); \ + } \ + std::string toString() { \ + std::stringstream ss; \ + ss << param1 << ", " << param2; \ + return ss.str(); \ + } \ + TEST_ARGS_CONTENTS() \ + type1 param1; \ + type2 param2; \ + private: \ + std::tuple args; \ + std::tuple, TestRange> ranges; \ + }; + +#define DEFINE_PARAMS_3(name, \ + type1, param1, range1, \ + type2, param2, range2, \ + type3, param3, range3) \ + class name ## _class : public TestArgsBase { \ + public: \ + name ## _class() { \ + args = std::make_tuple(¶m1, ¶m2, ¶m3); \ + ranges = std::make_tuple( \ + TestRangerange1, \ + TestRangerange2, \ + TestRangerange3 ); \ + } \ + std::string toString() { \ + std::stringstream ss; \ + ss << param1 << ", " << param2 << ", " << param3; \ + return ss.str(); \ + } \ + TEST_ARGS_CONTENTS() \ + type1 param1; \ + type2 param2; \ + type3 param3; \ + private: \ + std::tuple args; \ + std::tuple, \ + TestRange, \ + TestRange> ranges; \ + }; + +#define 
SET_PARAMS(name) \ + TestArgsWrapper name(new name ## _class()) + +#define GET_PARAMS(name) \ + name ## _class* name = static_cast(TEST_args_base__) + +#define PARAM_BASE TestArgsBase* TEST_args_base__ + +#define TEST_SUITE_AUTO_PREFIX __func__ + +#define TEST_SUITE_PREPARE_PATH(path) \ + const std::string _ts_auto_prefiix_ = TEST_SUITE_AUTO_PREFIX; \ + TestSuite::clearTestFile(_ts_auto_prefiix_); \ + path = TestSuite::getTestFileName(_ts_auto_prefiix_); + +#define TEST_SUITE_CLEANUP_PATH() \ + TestSuite::clearTestFile( _ts_auto_prefiix_, \ + TestSuite::END_OF_TEST ); + diff --git a/tests/unit/crc32_test.cc b/tests/unit/crc32_test.cc new file mode 100644 index 0000000..a71b8bc --- /dev/null +++ b/tests/unit/crc32_test.cc @@ -0,0 +1,60 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "test_common.h" + +#include "crc32.h" + +namespace crc32_test { + +int helloworld_crc_test() { + std::string tmp = "helloworld"; + uint32_t crc_val = crc32_8(tmp.data(), tmp.size(), 0); + TestSuite::_msg("%zx\n", crc_val); + + // CRC32 of `helloworld` is known as follows: + CHK_EQ( 0xf9eb20ad, crc_val ); + + std::string tmp2 = "hello"; + std::string tmp3 = "world"; + crc_val = crc32_8(tmp2.data(), tmp2.size(), 0); + crc_val = crc32_8(tmp3.data(), tmp3.size(), crc_val); + TestSuite::_msg("%zx\n", crc_val); + + // Should be the same. + CHK_EQ( 0xf9eb20ad, crc_val ); + + std::vector payload(1024, 0); + for (size_t ii=1; ii<=1024; ++ii) { + uint32_t crc_val = crc32_8(&payload[0], ii, 0); + TestSuite::_msg("%zx\n", crc_val); + } + + return 0; +} + +}; // namespace crc32_test +using namespace crc32_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.doTest("helloworld crc test", helloworld_crc_test); + + return 0; +} + + diff --git a/tests/unit/fileops_directio_test.cc b/tests/unit/fileops_directio_test.cc new file mode 100644 index 0000000..031c01a --- /dev/null +++ b/tests/unit/fileops_directio_test.cc @@ -0,0 +1,599 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include +#include +#include + +#include "fileops_directio.h" +#include "fileops_posix.h" + +#include "test_common.h" + +using namespace jungle; + +int read_write_without_open_test() +{ + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle = nullptr; + + Status s; + char buf[256]; + + s = ops->pread(fhandle, buf, 256, 0); + CHK_NOT(s); + + memset(buf, 'x', 256); + s = ops->append(fhandle, buf, 256); + CHK_NOT(s); + + s = ops->flush(fhandle); + CHK_NOT(s); + + s = ops->close(fhandle); + CHK_NOT(s); + + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int normal_read_write_test() +{ + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + char buf[256]; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + CHK_OK(fhandle->isOpened()); + + memset(buf, 'x', 256); + // DirectIO random write not supported + s = ops->pwrite(fhandle, buf, 256, 0); + CHK_NOT(s); + s = ops->append(fhandle, buf, 256); + CHK_OK(s); + + s = ops->flush(fhandle); + CHK_OK(s); + + memset(buf, 'a', 256); + s = ops->pread(fhandle, buf, 256, 0); + CHK_OK(s); + + char buf_chk[256]; + memset(buf_chk, 'x', 256); + CHK_EQ(0, memcmp(buf_chk, buf, 256)); + + // Check file size + cs_off_t offset = ops->eof(fhandle); + CHK_EQ(ALIGNMENT, offset); + + // Check padding bytes + memset(buf, 'a', 256); + s = ops->pread(fhandle, buf, 256, 256); + CHK_OK(s); + uint64_t val = 0; + uint64_t offset_tmp = 0; + read_mem_64(buf, val, 0, offset_tmp); + CHK_EQ(PADDING_HEADER_FLAG, val); + + // Read beyond file size + s = ops->pread(fhandle, buf, 300, 256); + CHK_NOT(s); + + s = ops->close(fhandle); + CHK_OK(s); + 
CHK_NOT(fhandle->isOpened()); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +uint32_t randint(uint32_t begin_inclusive, uint32_t end_inclusive) { + assert(begin_inclusive <= end_inclusive); + return std::rand() % (end_inclusive - begin_inclusive + 1) + begin_inclusive; +} + +void _set_circling_data(uint8_t* buf, size_t size, size_t int_seq = 0) { + uint8_t seq = static_cast(int_seq % 256); + for (uint32_t i = 0; i < size; i++) { + buf[i] = seq; + seq++; + } +} + +int _check_circling_data(uint8_t* buf, size_t size, size_t int_seq = 0) { + uint8_t seq = static_cast(int_seq % 256); + for (uint32_t i = 0; i < size; i++) { + CHK_EQ(seq, buf[i]); + seq++; + } + return 0; +} + +size_t _file_size(size_t data_size) { + size_t need_align = data_size % ALIGNMENT; + if (0 == need_align) { + return data_size; + } else { + // Make sure enougth room for padding bytes + return need_align <= (ALIGNMENT - 8) + ? data_size - need_align + ALIGNMENT + : data_size - need_align + 2 * ALIGNMENT; + } +} + +int unaligned_read_write_test() +{ + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + CHK_OK(fhandle->isOpened()); + + // Prepare write buffer + size_t buf_size = static_cast(randint(1, 10) + * ALIGNED_BUFFER_SIZE + + randint(0, ALIGNMENT - 1)); + uint8_t* buf = new uint8_t[buf_size]; + uint8_t* buf_chk = new uint8_t[buf_size]; + _set_circling_data(buf, buf_size); + _check_circling_data(buf, buf_size); + + // Append into file with random size + size_t buf_pos = 0; + bool truncated = false; + size_t truncate_pos = randint(1, buf_size - 1); + while (buf_pos < buf_size) { + size_t slice = randint(1, buf_size - buf_pos); + s = ops->append(fhandle, buf + buf_pos, slice); + CHK_OK(s); + buf_pos += 
slice; + + // Truncate + if (!truncated && buf_pos > truncate_pos) { + truncated = true; + buf_pos = truncate_pos; + + s = ops->flush(fhandle); + CHK_OK(s); + + s = ops->ftruncate(fhandle, truncate_pos); + CHK_OK(s); + // Check file size + cs_off_t file_size = ops->eof(fhandle); + CHK_EQ(truncate_pos, file_size); + // Read all + s = ops->pread(fhandle, buf_chk, truncate_pos, 0); + CHK_OK(s); + _check_circling_data(buf_chk, truncate_pos); + // Read tail + s = ops->pread(fhandle, buf_chk, 1, truncate_pos - 1); + CHK_OK(s); + _check_circling_data(buf_chk, 1, truncate_pos - 1); + // Read beyond file size + s = ops->pread(fhandle, buf_chk, 1, truncate_pos); + CHK_NOT(s); + } + } + s = ops->flush(fhandle); + CHK_OK(s); + + // Check file size + cs_off_t file_size = ops->eof(fhandle); + CHK_EQ(_file_size(buf_size), file_size); + + // Read all + s = ops->pread(fhandle, buf_chk, buf_size, 0); + CHK_OK(s); + _check_circling_data(buf_chk, buf_size); + + // Check padding bytes + if (buf_size % ALIGNMENT != 0) { + s = ops->pread(fhandle, buf_chk, 8, buf_size); + CHK_OK(s); + uint64_t val = 0; + uint64_t offset_tmp = 0; + read_mem_64(buf_chk, val, 0, offset_tmp); + CHK_EQ(PADDING_HEADER_FLAG, val); + } + + // Randomly read + for (int i = 0; i < 100; i++) { + size_t offset = randint(0, buf_size - 1); + size_t slice = randint(1, buf_size - offset); + s = ops->pread(fhandle, buf_chk, slice, offset); + CHK_OK(s); + _check_circling_data(buf_chk, slice, offset); + } + + s = ops->close(fhandle); + CHK_OK(s); + CHK_NOT(fhandle->isOpened()); + + delete[] buf; + delete[] buf_chk; + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int unaligned_file_read_test() +{ + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + // Prepare write buffer + size_t buf_size = static_cast(randint(1, 10) + * ALIGNED_BUFFER_SIZE + + randint(1, ALIGNMENT - 1)); + 
uint8_t* buf = new uint8_t[buf_size]; + uint8_t* buf_chk = new uint8_t[buf_size]; + _set_circling_data(buf, buf_size); + _check_circling_data(buf, buf_size); + + // Generate unaligned file + { + FileOps* ops = new FileOpsPosix(); + FileHandle* fhandle; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + CHK_OK(fhandle->isOpened()); + + s = ops->pwrite(fhandle, buf, buf_size, 0); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + CHK_NOT(fhandle->isOpened()); + + delete fhandle; + delete ops; + } + + // Open unaligned file with direct-io + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + CHK_OK(fhandle->isOpened()); + + // Check file size + cs_off_t file_size = ops->eof(fhandle); + CHK_EQ(buf_size, file_size); + + // Read all + s = ops->pread(fhandle, buf_chk, buf_size, 0); + CHK_OK(s); + _check_circling_data(buf_chk, buf_size); + + // Randomly read + for (int i = 0; i < 100; i++) { + size_t offset = randint(0, buf_size - 1); + size_t slice = randint(1, buf_size - offset); + s = ops->pread(fhandle, buf_chk, slice, offset); + CHK_OK(s); + _check_circling_data(buf_chk, slice, offset); + } + + s = ops->close(fhandle); + CHK_OK(s); + CHK_NOT(fhandle->isOpened()); + + delete[] buf; + delete[] buf_chk; + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int unaligned_file_write_test() +{ + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + // Prepare write buffer + size_t buf_size = static_cast(randint(1, 10) + * ALIGNED_BUFFER_SIZE + + randint(1, ALIGNMENT - 1)); + uint8_t* buf = new uint8_t[buf_size]; + uint8_t* buf_chk = new uint8_t[buf_size]; + _set_circling_data(buf, buf_size); + _check_circling_data(buf, buf_size); + + // Generate unaligned file + { + FileOps* ops = new FileOpsPosix(); + 
FileHandle* fhandle; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + CHK_OK(fhandle->isOpened()); + + s = ops->pwrite(fhandle, buf, buf_size, 0); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + CHK_NOT(fhandle->isOpened()); + + delete fhandle; + delete ops; + } + + // Open unaligned file with direct-io + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + CHK_OK(fhandle->isOpened()); + + // Append into file with random size + size_t buf_pos = 0; + while (buf_pos < buf_size) { + size_t slice = randint(1, buf_size - buf_pos); + s = ops->append(fhandle, buf + buf_pos, slice); + CHK_OK(s); + buf_pos += slice; + } + s = ops->flush(fhandle); + CHK_OK(s); + + // Check file size + cs_off_t file_size = ops->eof(fhandle); + CHK_EQ(_file_size(2 * buf_size), file_size); + + // Read all + s = ops->pread(fhandle, buf_chk, buf_size, 0); + CHK_OK(s); + _check_circling_data(buf_chk, buf_size); + s = ops->pread(fhandle, buf_chk, buf_size, buf_size); + CHK_OK(s); + _check_circling_data(buf_chk, buf_size); + + s = ops->close(fhandle); + CHK_OK(s); + CHK_NOT(fhandle->isOpened()); + + delete[] buf; + delete[] buf_chk; + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int fsync_test() +{ + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + char buf[256]; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->append(fhandle, buf, 256); + CHK_OK(s); + + s = ops->flush(fhandle); + CHK_OK(s); + + s = ops->fsync(fhandle); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int exist_test() { + 
const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + std::string not_exist_filename = "not_exist"; + + Status s; + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + char buf[256]; + + // Open, write something, close. + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->append(fhandle, buf, 256); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + + // Should exist. + CHK_OK(ops->exist(filename.c_str())); + + // Should not exist. + CHK_NOT(ops->exist(not_exist_filename.c_str())); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int remove_test() { + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + char buf[256]; + + // Open, write something, close. + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->append(fhandle, buf, 256); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + + // Remove it. + s = ops->remove(filename.c_str()); + CHK_OK(s); + + // Should not exist. + CHK_NOT(ops->exist(filename.c_str())); + + delete fhandle; + delete ops; + + return 0; +} + +int mkdir_test() { + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string dirname = TestSuite::getTestFileName(prefix); + std::string filename = dirname + "/file"; + + Status s; + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + char buf[256]; + + // Directory should not exist + CHK_NOT(ops->exist(dirname.c_str())); + + // Make a directory + s = ops->mkdir(dirname.c_str()); + CHK_OK(s); + + // Open, write something, close. 
+ s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->append(fhandle, buf, 256); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + + // Remove file. + s = ops->remove(filename.c_str()); + CHK_OK(s); + + // Remove directory. + s = ops->remove(dirname.c_str()); + CHK_OK(s); + + // Directory should not exist + CHK_NOT(ops->exist(dirname.c_str())); + + delete fhandle; + delete ops; + + return 0; +} + +int get_ops_test() { + const std::string prefix = "file_directio_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps* ops = new FileOpsDirectIO(nullptr); + FileHandle* fhandle; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + // Return value of ops() should be same to the original ops. + CHK_EQ(ops, fhandle->ops()); + + // Call close() using ops() member function, + // instead of ops pointer. + s = fhandle->ops()->close(fhandle); + CHK_OK(s); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int main(int argc, char** argv) { + std::srand(std::time(0)); + TestSuite test(argc, argv); + + test.doTest("read write without open test", read_write_without_open_test); + test.doTest("normal read write test", normal_read_write_test); + test.doTest("unaligned read write test", unaligned_read_write_test); + test.doTest("unaligned file read test", unaligned_file_read_test); + test.doTest("unaligned file write test", unaligned_file_write_test); + test.doTest("file fsync test", fsync_test); + test.doTest("file exist test", exist_test); + test.doTest("file remove test", remove_test); + test.doTest("file mkdir test", mkdir_test); + test.doTest("get ops test", get_ops_test); + + return 0; +} diff --git a/tests/unit/fileops_test.cc b/tests/unit/fileops_test.cc new file mode 100644 index 0000000..7ff4df1 --- /dev/null +++ b/tests/unit/fileops_test.cc @@ -0,0 +1,333 @@ 
+/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include +#include + +#include "fileops_posix.h" + +#include "test_common.h" + +using namespace jungle; + +int read_write_without_open_test() +{ + const std::string prefix = "file_posix_test"; + TestSuite::clearTestFile(prefix); + FileOps *ops = new FileOpsPosix(); + FileHandle* fhandle = nullptr; + + Status s; + char buf[256]; + + s = ops->pread(fhandle, buf, 256, 0); + CHK_NOT(s); + + memset(buf, 'x', 256); + s = ops->pwrite(fhandle, buf, 256, 0); + CHK_NOT(s); + + s = ops->flush(fhandle); + CHK_NOT(s); + + s = ops->close(fhandle); + CHK_NOT(s); + + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int normal_read_write_test() +{ + const std::string prefix = "file_posix_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps* ops = new FileOpsPosix(); + FileHandle* fhandle; + char buf[256]; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + CHK_OK(fhandle->isOpened()); + + memset(buf, 'x', 256); + s = ops->pwrite(fhandle, buf, 256, 0); + CHK_OK(s); + + s = ops->flush(fhandle); + CHK_OK(s); + + memset(buf, 'a', 256); + s = ops->pread(fhandle,buf, 256, 0); + CHK_OK(s); + + char buf_chk[256]; + memset(buf_chk, 'x', 256); + CHK_EQ(0, 
memcmp(buf_chk, buf, 256)); + + s = ops->ftruncate(fhandle, 111); + CHK_OK(s); + cs_off_t file_size = ops->eof(fhandle); + CHK_EQ(111, file_size); + + s = ops->close(fhandle); + CHK_OK(s); + CHK_NOT(fhandle->isOpened()); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int eof_test() +{ + const std::string prefix = "file_posix_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps *ops = new FileOpsPosix(); + FileHandle* fhandle; + char buf[256]; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->pwrite(fhandle, buf, 256, 0); + CHK_OK(s); + + s = ops->flush(fhandle); + CHK_OK(s); + + cs_off_t offset = ops->eof(fhandle); + CHK_EQ(256, offset); + + s = ops->close(fhandle); + CHK_OK(s); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int fsync_test() +{ + const std::string prefix = "file_posix_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps *ops = new FileOpsPosix(); + FileHandle* fhandle; + char buf[256]; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->pwrite(fhandle, buf, 256, 0); + CHK_OK(s); + + s = ops->flush(fhandle); + CHK_OK(s); + + s = ops->fsync(fhandle); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int exist_test() { + const std::string prefix = "file_posix_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + std::string not_exist_filename = "not_exist"; + + Status s; + FileOps *ops = new FileOpsPosix(); + FileHandle* fhandle; + char buf[256]; + + // Open, write something, close. 
+ s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->pwrite(fhandle, buf, 256, 0); + CHK_OK(s); + + s = ops->flush(fhandle); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + + // Should exist. + CHK_OK(ops->exist(filename.c_str())); + + // Should not exist. + CHK_NOT(ops->exist(not_exist_filename.c_str())); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int remove_test() { + const std::string prefix = "file_posix_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps *ops = new FileOpsPosix(); + FileHandle* fhandle; + char buf[256]; + + // Open, write something, close. + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->pwrite(fhandle, buf, 256, 0); + CHK_OK(s); + + s = ops->flush(fhandle); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + + // Remove it. + s = ops->remove(filename.c_str()); + CHK_OK(s); + + // Should not exist. + CHK_NOT(ops->exist(filename.c_str())); + + delete fhandle; + delete ops; + + return 0; +} + +int mkdir_test() { + const std::string prefix = "file_posix_test"; + TestSuite::clearTestFile(prefix); + std::string dirname = TestSuite::getTestFileName(prefix); + std::string filename = dirname + "/file"; + + Status s; + FileOps *ops = new FileOpsPosix(); + FileHandle* fhandle; + char buf[256]; + + // Directory should not exist + CHK_NOT(ops->exist(dirname.c_str())); + + // Make a directory + s = ops->mkdir(dirname.c_str()); + CHK_OK(s); + + // Open, write something, close. + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + memset(buf, 'x', 256); + s = ops->pwrite(fhandle, buf, 256, 0); + CHK_OK(s); + + s = ops->flush(fhandle); + CHK_OK(s); + + s = ops->close(fhandle); + CHK_OK(s); + + // Remove file. 
+ s = ops->remove(filename.c_str()); + CHK_OK(s); + + // Remove directory. + s = ops->remove(dirname.c_str()); + CHK_OK(s); + + // Directory should not exist + CHK_NOT(ops->exist(dirname.c_str())); + + delete fhandle; + delete ops; + + return 0; +} + +int get_ops_test() { + const std::string prefix = "file_posix_test"; + TestSuite::clearTestFile(prefix); + std::string filename = TestSuite::getTestFileName(prefix); + + Status s; + FileOps *ops = new FileOpsPosix(); + FileHandle* fhandle; + + s = ops->open(&fhandle, filename.c_str(), FileOps::NORMAL); + CHK_OK(s); + + // Return value of ops() should be same to the original ops. + CHK_EQ(ops, fhandle->ops()); + + // Call close() using ops() member function, + // instead of ops pointer. + s = fhandle->ops()->close(fhandle); + CHK_OK(s); + + delete fhandle; + delete ops; + TestSuite::clearTestFile(prefix); + + return 0; +} + +int main(int argc, char** argv) { + TestSuite test(argc, argv); + + test.doTest("read write without open test", read_write_without_open_test); + test.doTest("normal read write test", normal_read_write_test); + test.doTest("file EOF test", eof_test); + test.doTest("file fsync test", fsync_test); + test.doTest("file exist test", exist_test); + test.doTest("file remove test", remove_test); + test.doTest("file mkdir test", mkdir_test); + test.doTest("get ops test", get_ops_test); + + return 0; +} diff --git a/tests/unit/keyvalue_test.cc b/tests/unit/keyvalue_test.cc new file mode 100644 index 0000000..894f96f --- /dev/null +++ b/tests/unit/keyvalue_test.cc @@ -0,0 +1,364 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "test_common.h" + +#include "internal_helper.h" + +#include + +#include +#include + +using namespace jungle; + +int sb_empty_test() { + SizedBuf sb; + + CHK_EQ(0, sb.size); + CHK_NULL(sb.data); + + return 0; +} + +int sb_normal_test() { + SizedBuf sb; + char str[16]; + char str2[8]; + + sb = SizedBuf(16, str); + CHK_EQ(16, sb.size); + CHK_NONNULL(sb.data); + + sb.set(8, str2); + CHK_EQ(8, sb.size); + CHK_EQ((void*)str2, sb.data); + + return 0; +} + +int sb_clone_test() { + SizedBuf sb; + SizedBuf sb_clone(sb); + + CHK_EQ(sb.size, sb_clone.size); + CHK_EQ(sb.data, sb_clone.data); + + return 0; +} + +int sb_string_test() { + SizedBuf sb; + + std::string str1 = "test_string"; + std::string str2 = "hello world"; + + sb = SizedBuf(str1); + CHK_EQ(str1.size(), sb.size); + CHK_OK(!memcmp(str1.c_str(), sb.data, sb.size)); + + sb.set(str2); + CHK_EQ(str2.size(), sb.size); + CHK_OK(!memcmp(str2.c_str(), sb.data, sb.size)); + + std::string str3 = sb.toString(); + CHK_EQ(str2, str3); + + return 0; +} + +int sb_equal_test() { + std::string str1 = "test_string"; + std::string str2 = "test_string"; + std::string str3 = "!!test_string"; + SizedBuf sb1(str1), sb2(str2), sb3(str3); + + CHK_OK((void*)sb1.data != (void*)sb2.data); + CHK_OK((void*)sb2.data != (void*)sb3.data); + CHK_OK((void*)sb3.data != (void*)sb1.data); + + CHK_OK(sb1 == sb2); + CHK_OK(sb1 != sb3); + CHK_OK(sb2 != sb3); + + return 0; +} + +int sb_equal_empty_test() { + std::string str3 = "!!test_string"; + SizedBuf sb1, sb2, sb3(str3); 
+ + CHK_OK((void*)sb2.data != (void*)sb3.data); + CHK_OK((void*)sb3.data != (void*)sb1.data); + + CHK_OK(sb1 == sb2); + CHK_OK(sb1 != sb3); + CHK_OK(sb2 != sb3); + + return 0; +} + +int sb_clear_test() +{ + SizedBuf sb; + + std::string str = "test_string"; + sb = SizedBuf(str); + CHK_EQ(str.size(), sb.size); + CHK_OK(!memcmp(str.c_str(), sb.data, sb.size)); + + sb.clear(); + CHK_EQ(0, sb.size); + CHK_NULL(sb.data); + + return 0; +} + +int sb_free_test() +{ + std::string str = "test_string"; + SizedBuf sb(str); + // Should fail. + CHK_NOT( sb.free() ); + + char str_raw[] = "test"; + SizedBuf sb2(4, str_raw); + // Should fail. + CHK_NOT( sb.free() ); + + SizedBuf sb3(4); + CHK_TRUE( sb3.free() ); + + char* str_raw2 = (char*)malloc(4); + SizedBuf sb4(4, str_raw2); + CHK_NOT( sb4.free() ); + + sb4.setNeedToFree(); + CHK_TRUE( sb4.free() ); + + char* str_raw3 = new char[4]; + SizedBuf sb5(4, str_raw3); + CHK_NOT( sb5.free() ); + + sb5.setNeedToDelete(); + CHK_TRUE( sb5.free() ); + + return 0; +} + +int sb_compare_test() { + SizedBuf aa("aa"); + SizedBuf aaa("aaa"); + SizedBuf abc("abc"); + + CHK_OK(aa == aa); + CHK_NOT(aa == aaa); + CHK_NOT(aa == abc); + + CHK_OK(aa < aaa); + CHK_OK(aa < abc); + CHK_OK(aaa < abc); + CHK_NOT(aa < aa); + + CHK_NOT(aaa < aa); + CHK_NOT(abc < aa); + CHK_NOT(abc < aaa); + + CHK_OK(aa <= aaa); + CHK_OK(aa <= abc); + CHK_OK(aaa <= abc); + CHK_OK(aa <= aa); + + CHK_NOT(aaa <= aa); + CHK_NOT(abc <= aa); + CHK_NOT(abc <= aaa); + + CHK_OK(aaa > aa); + CHK_OK(abc > aa); + CHK_OK(abc > aaa); + CHK_NOT(aa > aa); + + CHK_NOT(aa > aaa); + CHK_NOT(aa > abc); + CHK_NOT(aaa > abc); + + CHK_OK(aaa >= aa); + CHK_OK(abc >= aa); + CHK_OK(abc >= aaa); + CHK_OK(aa >= aa); + + CHK_NOT(aa >= aaa); + CHK_NOT(aa >= abc); + CHK_NOT(aaa >= abc); + + return 0; +} + +int sb_compare_empty_test() { + SizedBuf aa("aa"); + SizedBuf empty; + + CHK_OK(empty == empty); + CHK_OK(empty <= empty); + CHK_OK(empty >= empty); + + CHK_OK(empty < aa); + CHK_OK(empty <= aa); + 
CHK_OK(aa > empty); + CHK_OK(aa >= empty); + + CHK_NOT(empty > aa); + CHK_NOT(empty >= aa); + CHK_NOT(aa < empty); + CHK_NOT(aa <= empty); + + return 0; +} + +int sb_cmp_func_test() { + SizedBuf aa("aa"); + SizedBuf aaa("aaa"); + SizedBuf abc("abc"); + SizedBuf empty; + + CHK_EQ(0, SizedBuf::cmp(aa, aa)); + CHK_EQ(0, SizedBuf::cmp(empty, empty)); + + CHK_SM(SizedBuf::cmp(empty, aa), 0); + CHK_SM(SizedBuf::cmp(aa, aaa), 0); + CHK_SM(SizedBuf::cmp(aaa, abc), 0); + CHK_SM(SizedBuf::cmp(aa, abc), 0); + + CHK_GT(SizedBuf::cmp(aa, empty), 0); + CHK_GT(SizedBuf::cmp(aaa, aa), 0); + CHK_GT(SizedBuf::cmp(abc, aaa), 0); + CHK_GT(SizedBuf::cmp(abc, aa), 0); + + return 0; +} + +int rw_serializer_test() { + SizedBuf buf(128); + SizedBuf::Holder h_buf(buf); + + SizedBuf aaa("aaa"); + RwSerializer ww(buf); + + CHK_Z( ww.putSb(aaa) ); + CHK_Z( ww.putU8(1) ); + CHK_Z( ww.putU16(2) ); + CHK_Z( ww.putU32(3) ); + CHK_Z( ww.putU64(4) ); + + RwSerializer rr(buf); + SizedBuf tmp; + SizedBuf::Holder h_tmp(tmp); + CHK_Z( rr.getSb(tmp) ); + CHK_EQ(aaa, tmp); + CHK_EQ(1, rr.getU8()); + CHK_EQ(2, rr.getU16()); + CHK_EQ(3, rr.getU32()); + CHK_EQ(4, rr.getU64()); + + return 0; +} + +int resizable_rw_serializer_test() { + SizedBuf buf(4); + SizedBuf::Holder h_buf(buf); + + SizedBuf aaa("aaa"); + RwSerializer ww(&buf); + + CHK_Z( ww.putSb(aaa) ); + CHK_Z( ww.putU8(1) ); + CHK_Z( ww.putU16(2) ); + CHK_Z( ww.putU32(3) ); + CHK_Z( ww.putU64(4) ); + + RwSerializer rr(buf); + SizedBuf tmp; + SizedBuf::Holder h_tmp(tmp); + CHK_Z( rr.getSb(tmp) ); + CHK_EQ(aaa, tmp); + CHK_EQ(1, rr.getU8()); + CHK_EQ(2, rr.getU16()); + CHK_EQ(3, rr.getU32()); + CHK_EQ(4, rr.getU64()); + + return 0; +} + +int empty_rw_serializer_test() { + SizedBuf buf; + SizedBuf::Holder h_buf(buf); + + SizedBuf aaa("hello world 1234567890"); + RwSerializer ww(&buf); + + CHK_Z( ww.putSb(aaa) ); + CHK_Z( ww.putU8(1) ); + CHK_Z( ww.putU16(2) ); + CHK_Z( ww.putU32(3) ); + CHK_Z( ww.putU64(4) ); + + RwSerializer rr(buf); + SizedBuf tmp; + 
SizedBuf::Holder h_tmp(tmp); // dummy holder, should do nothing. + CHK_Z( rr.getSb(tmp, false) ); + CHK_EQ(aaa, tmp); + CHK_EQ(1, rr.getU8()); + CHK_EQ(2, rr.getU16()); + CHK_EQ(3, rr.getU32()); + CHK_EQ(4, rr.getU64()); + + return 0; +} + +int kv_empty_test() +{ + KV kv; + SizedBuf sb_empty; + + CHK_EQ(sb_empty, kv.key); + CHK_EQ(sb_empty, kv.value); + + return 0; +} + +int main(int argc, char** argv) { + TestSuite test(argc, argv); + + test.doTest("SizedBuf empty test", sb_empty_test); + test.doTest("SizedBuf normal test", sb_normal_test); + test.doTest("SizedBuf clone test", sb_clone_test); + test.doTest("SizedBuf string test", sb_string_test); + test.doTest("SizedBuf clear test", sb_clear_test); + test.doTest("SizedBuf free test", sb_free_test); + test.doTest("SizedBuf equal test", sb_equal_test); + test.doTest("SizedBuf equal empty test", sb_equal_empty_test); + test.doTest("SizedBuf comparison test", sb_compare_test); + test.doTest("SizedBuf comparison with empty test", sb_compare_empty_test); + test.doTest("SizedBuf cmp function test", sb_cmp_func_test); + + test.doTest("RW Serializer test", rw_serializer_test); + test.doTest("Resizable RW Serializer test", resizable_rw_serializer_test); + test.doTest("Empty RW Serializer test", empty_rw_serializer_test); + + test.doTest("KV empty test", kv_empty_test); + + return 0; +} diff --git a/tests/unit/memtable_test.cc b/tests/unit/memtable_test.cc new file mode 100644 index 0000000..7dc7218 --- /dev/null +++ b/tests/unit/memtable_test.cc @@ -0,0 +1,182 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include +#include + +#include +#include + +#include "libjungle/jungle.h" +#include "log_file.h" +#include "log_mgr.h" +#include "memtable.h" + +#include "test_common.h" + +using namespace jungle; + +int memtable_key_itr_test() { + Status s; + DBConfig config; + LogMgrOptions l_opt; + l_opt.dbConfig = &config; + LogMgr l_mgr(nullptr, l_opt); + LogFile l_file(&l_mgr); + MemTable mt(&l_file); + mt.init(); + + size_t num = 10; + size_t modulo = 3; + std::vector rec(num); + size_t idx[modulo]; + char keybuf[32]; + char valbuf[32]; + for (size_t ii=0; ii rec(num); + + char keybuf[32]; + char valbuf[32]; + for (size_t ii=0; ii +#include + +#include +#include + +using namespace jungle; + +namespace booster_test { + +using TElem = TableLookupBooster::Elem; + +int basic_get_set_test() { + size_t NUM = 1000; + TableLookupBooster tlb(NUM, nullptr, nullptr); + std::hash h_func; + + for (size_t ii=1; ii<=NUM; ++ii) { + CHK_Z( tlb.setIfNew( TElem(h_func(ii), ii*2, ii*10) ) ); + } + CHK_EQ(NUM, tlb.size()); + + for (size_t ii=1; ii<=NUM; ++ii) { + uint64_t offset_out = 0; + CHK_Z( tlb.get( h_func(ii), offset_out ) ); + CHK_EQ( ii*10, offset_out ); + } + + return 0; +} + +int set_newer_test() { + size_t NUM = 1000; + TableLookupBooster tlb(NUM, nullptr, nullptr); + std::hash h_func; + + // Initial. + for (size_t ii=1; ii<=NUM; ++ii) { + CHK_Z( tlb.setIfNew( TElem(h_func(ii), ii*2, ii*10) ) ); + } + CHK_EQ(NUM, tlb.size()); + + // Set newer. 
+ for (size_t ii=1; ii<=NUM; ++ii) { + CHK_Z( tlb.setIfNew( TElem(h_func(ii), ii*3, ii*20) ) ); + } + CHK_EQ(NUM, tlb.size()); + + // Set older. + for (size_t ii=1; ii<=NUM; ++ii) { + CHK_FALSE( tlb.setIfNew( TElem(h_func(ii), ii, ii*30) ) ); + } + CHK_EQ(NUM, tlb.size()); + + for (size_t ii=1; ii<=NUM; ++ii) { + uint64_t offset_out = 0; + CHK_Z( tlb.get( h_func(ii), offset_out ) ); + CHK_EQ( ii*20, offset_out ); + } + + return 0; +} + +int eviction_test() { + size_t NUM = 10; + TableLookupBooster tlb(NUM, nullptr, nullptr); + std::hash h_func; + + // Initial. + for (size_t ii = 1; ii <= NUM * 100; ++ii) { + CHK_Z( tlb.setIfNew( TElem(h_func(ii), ii*2, ii*10) ) ); + } + CHK_EQ(NUM, tlb.size()); + + size_t count = 0; + for (size_t ii = 1; ii <= NUM * 100; ++ii) { + uint64_t offset_out = 0; + Status s; + s = tlb.get( h_func(ii), offset_out ); + if (s) { + count++; + } + } + CHK_EQ(NUM, count); + + for (size_t ii = NUM * 99 + 1; ii <= NUM * 99 + 3; ++ii) { + uint64_t offset_out = 0; + CHK_Z( tlb.get( h_func(ii), offset_out ) ); + } + for (size_t ii = 1; ii <= 3; ++ii) { + CHK_Z( tlb.setIfNew( TElem(h_func(ii), ii*3, ii*20) ) ); + } + CHK_EQ(NUM, tlb.size()); + + count = 0; + for (size_t ii = 1; ii <= NUM * 100; ++ii) { + uint64_t offset_out = 0; + Status s; + s = tlb.get( h_func(ii), offset_out ); + if (s) { + count++; + } + } + CHK_EQ(NUM, count); + + return 0; +} + +}; // namespace booster_test; +using namespace booster_test; + +int main(int argc, char** argv) { + TestSuite ts(argc, argv); + + ts.doTest("basic get set test", basic_get_set_test); + ts.doTest("set newer test", set_newer_test); + ts.doTest("eviction test", eviction_test); + + return 0; +} diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt new file mode 100644 index 0000000..e856a46 --- /dev/null +++ b/tools/CMakeLists.txt @@ -0,0 +1,18 @@ +set(TOOLS_DIR ${PROJECT_SOURCE_DIR}/tools) + +set(JUNGLE_TOOLS_DEPS + ${CMAKE_CURRENT_BINARY_DIR}/../libjungle.a + ${LIBSIMPLELOGGER} + 
${FDB_LIB_DIR}/libforestdb.a + ${LIBSNAPPY} + ${LIBDL}) + +set(BF_GEN ${TOOLS_DIR}/bloomfilter_generator.cc) +add_executable(bloomfilter_generator ${BF_GEN}) +target_link_libraries(bloomfilter_generator ${JUNGLE_TOOLS_DEPS}) +add_dependencies(bloomfilter_generator static_lib) + +set(JUNGLE_CHECKER ${TOOLS_DIR}/jungle_checker.cc) +add_executable(jungle_checker ${JUNGLE_CHECKER}) +target_link_libraries(jungle_checker ${JUNGLE_TOOLS_DEPS}) +add_dependencies(jungle_checker static_lib) diff --git a/tools/bloomfilter_generator.cc b/tools/bloomfilter_generator.cc new file mode 100644 index 0000000..c866ac9 --- /dev/null +++ b/tools/bloomfilter_generator.cc @@ -0,0 +1,196 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+**************************************************************************/ + +#include "bloomfilter.h" +#include "db_mgr.h" +#include "fileops_posix.h" +#include "table_file.h" +#include "table_mgr.h" +#include "test_common.h" + +#include _MACRO_TO_STR(LOGGER_H) + +#include + +#include +#include + +#include + +using namespace jungle; + +namespace bf_generator { + +class MutableTableMgr : public TableMgr { +public: + MutableTableMgr(DB* parent_db) : TableMgr(parent_db) {} + + void setOpt(const TableMgrOptions& to) { opt = to; } +}; + +enum BfMode { + ORIGINAL = 0x0, + APPEND = 0x1, +}; + +int bf_gen_file(TableMgr* t_mgr, + FileOps* f_ops, + const std::string& file_path) +{ + size_t pos = file_path.rfind("_"); + if (pos == std::string::npos) return -10000; + + const DBConfig* db_config = t_mgr->getDbConfig(); + + uint64_t t_num = atoi(file_path.substr(pos + 1).c_str()); + + TableFile t_file(t_mgr); + Status s; + TableFileOptions tf_opt; + s = t_file.load(0, t_num, file_path, f_ops, tf_opt); + if (!s) return (int)s; + + std::list recs; + TableFile::Iterator t_itr; + s = t_itr.init(nullptr, &t_file, SizedBuf(), SizedBuf()); + if (!s) return (int)s; + do { + Record rec_out; + Record::Holder h_rec_out(rec_out); + s = t_itr.get(rec_out); + if (!s) break; + + Record* rec_obj = new Record(); + rec_out.moveTo(*rec_obj); + recs.push_back(rec_obj); + } while (t_itr.next().ok()); + t_itr.close(); + + uint64_t bf_size = recs.size() * db_config->bloomFilterBitsPerUnit; + + BloomFilter bf(bf_size, 3); + for (Record* rr: recs) { + bf.set(rr->kv.key.data, rr->kv.key.size); + rr->free(); + delete rr; + } + + // Remove previous file if exists. 
+ std::string bf_file_name = file_path + ".bf"; + if (FileMgr::exist(bf_file_name)) { + FileMgr::remove(bf_file_name); + } + t_file.saveBloomFilter(bf_file_name, &bf, true); + + return 0; +} + +int bf_gen(const std::string& db_path, + BfMode bf_mode, + size_t bpk, + size_t max_table_size_mb) +{ + GlobalConfig g_config; + DBMgr::init(g_config); + + FileOpsPosix f_ops; + DBConfig db_config; + + db_config.bloomFilterBitsPerUnit = bpk; + + TableMgrOptions t_mgr_opt; + t_mgr_opt.path = db_path; + t_mgr_opt.fOps = &f_ops; + t_mgr_opt.dbConfig = &db_config; + + MutableTableMgr t_mgr(nullptr); + t_mgr.setOpt(t_mgr_opt); + + // Scan & find table files. + std::vector files; + int rc = FileMgr::scan(db_path, files); + if (rc != 0) return rc; + + std::vector actual_files; + + for (auto& entry: files) { + std::string& cur_file = entry; + std::string file_part = FileMgr::filePart(cur_file); + + if ( file_part.find("table") == 0 && + file_part.rfind("manifest") == std::string::npos && + file_part.rfind("bf") == std::string::npos ) { + actual_files.push_back(cur_file); + } + } + + size_t cnt = 0; + TestSuite::Progress pp(actual_files.size()); + for (auto& entry: actual_files) { + std::string& cur_file = entry; + std::string file_part = FileMgr::filePart(cur_file); + + rc = bf_gen_file(&t_mgr, &f_ops, db_path + "/" + cur_file); + if (rc != 0) return rc; + + pp.update(++cnt); + } + pp.done(); + + DBMgr::destroy(); + + return 0; +} + +void usage(int argc, char** argv) { + std::stringstream ss; + ss << "Usage: \n"; + ss << " " << argv[0] << " [DB path] " + << "[max table size in MB] " + << "[bits per key]\n"; + ss << std::endl; + + std::cout << ss.str(); + + exit(0); +} + +}; // namespace bf_generator; +using namespace bf_generator; + +int main(int argc, char** argv) { + if (argc < 4) { + usage(argc, argv); + } + + TestSuite ts(argc, argv); + ts.options.printTestMessage = true; + + std::string db_path = argv[1]; + + size_t max_table_size_mb = atoi(argv[2]); + BfMode bf_mode = 
ORIGINAL; + if (max_table_size_mb) bf_mode = APPEND; + + size_t bpk = atoi(argv[3]); + if (bpk == 0 || bpk > 100000) usage(argc, argv); + + ts.doTest( "bloomfilter generator", bf_gen, + db_path, bf_mode, bpk, max_table_size_mb ); + + return 0; +} + diff --git a/tools/jungle_checker.cc b/tools/jungle_checker.cc new file mode 100644 index 0000000..95c5eaa --- /dev/null +++ b/tools/jungle_checker.cc @@ -0,0 +1,406 @@ +/************************************************************************ +Copyright 2017-2019 eBay Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +**************************************************************************/ + +#include "db_internal.h" +#include "internal_helper.h" + +#include + +#include +#include +#include +#include + +#include + +namespace jungle { + +namespace checker { + +static std::string exec_filename; + +class Checker { +public: + +static int load_db(const std::string& db_path, + jungle::DB*& db_out, + bool& log_mode_out) { + if (!FileMgr::exist(db_path)) { + std::cout << "DB does not exist: " << db_path << std::endl; + return -1; + } + + GlobalConfig g_conf; + g_conf.fdbCacheSize = 0; + g_conf.numCompactorThreads = 0; + g_conf.numFlusherThreads = 0; + g_conf.numTableWriters = 0; + jungle::init(g_conf); + + // Check the mode of the given DB. 
+ log_mode_out = DB::isLogSectionMode(db_path); + + Status s; + db_out = nullptr; + DBConfig d_conf; + d_conf.readOnly = true; + d_conf.logSectionOnly = log_mode_out; + if (log_mode_out) { + d_conf.logFileTtl_sec = 3; + } + s = DB::open(&db_out, db_path, d_conf); + if (!s) { + std::cout << "DB open failed: " << (int)s << std::endl; + return -1; + } + + // Number of log files. + std::cout << "path: " << db_path << std::endl; + + // Log mode. + if (log_mode_out) { + std::cout << "mode: log store" << std::endl; + } else { + std::cout << "mode: database" << std::endl; + } + return 0; +} + +static int db_overview(const std::vector& args) { + if (args.size() < 2) { + std::cout + << "too few arguments:" << std::endl + << " " << exec_filename + << " " << args[0] << " " + << std::endl; + return -1; + } + + Status s; + const std::string& db_path = args[1]; + jungle::DB* db = nullptr; + bool log_mode = false; + int rc = load_db(db_path, db, log_mode); + if (rc != 0) return rc; + + size_t num_log_files = db->p->logMgr->getNumLogFiles(); + + // Log file number range. 
+ uint64_t min_log_file_idx = 0; + uint64_t max_log_file_idx = 0; + db->p->logMgr->mani->getMinLogFileNum(min_log_file_idx); + db->p->logMgr->mani->getMaxLogFileNum(max_log_file_idx); + + uint64_t last_flushed_idx = 0; + uint64_t last_synced_idx = 0; + db->p->logMgr->mani->getLastFlushedLog(last_flushed_idx); + db->p->logMgr->mani->getLastSyncedLog(last_synced_idx); + + printf( "number of log files: %zu (%zu - %zu)\n", + (size_t)num_log_files, + (size_t)min_log_file_idx, + (size_t)max_log_file_idx ); + if (valid_number(last_flushed_idx)) { + uint64_t last_flushed_seqnum = 0; + db->p->logMgr->getLastFlushedSeqNum(last_flushed_seqnum); + printf( " last flushed log file index: %zu (seq %zu)\n", + (size_t)last_flushed_idx, + (size_t)last_flushed_seqnum ); + } + if (valid_number(last_synced_idx)) { + uint64_t last_synced_seqnum = 0; + db->p->logMgr->getLastSyncedSeqNum(last_synced_seqnum); + printf( " last synced log file index: %zu (seq %zu)\n", + (size_t)last_synced_idx, + (size_t)last_synced_seqnum ); + } + uint64_t min_seq = 0; + uint64_t max_seq = 0; + db->p->logMgr->getMinSeqNum(min_seq); + db->p->logMgr->getMaxSeqNum(max_seq); + if ( min_seq && max_seq && valid_number(min_seq) && valid_number(max_seq) ) { + printf(" sequence number range: %zu - %zu\n", + (size_t)min_seq, (size_t)max_seq); + } else { + printf(" no active log (all logs have been flushed)\n"); + } + + if (!log_mode) { + // Number of levels. 
+ size_t num_levels = db->p->tableMgr->getNumLevels(); + printf("number of levels: %zu (bottommost level %zu)\n", + num_levels, num_levels - 1); + + size_t total_num_tables = 0; + size_t total_num_records = 0; + size_t total_size = 0; + size_t total_active_size = 0; + for (size_t ii=0; ii tables; + + db->p->tableMgr->mani->getTablesRange + ( ii, SizedBuf(), SizedBuf(), tables ); + num_tables = tables.size(); + TableStats t_stats; + for (auto& entry: tables) { + TableInfo*& t_info = entry; + t_info->file->getStats(t_stats); + level_size_total += t_stats.totalSizeByte; + level_size_active += t_stats.workingSetSizeByte; + num_records += t_stats.numKvs; + t_info->done(); + } + printf(" level %2zu: %4zu tables, %zu records, " + "%zu / %zu, %s / %s\n", + ii, num_tables, num_records, + level_size_active, level_size_total, + Formatter::sizeToString(level_size_active).c_str(), + Formatter::sizeToString(level_size_total).c_str()); + + total_num_records += num_records; + total_num_tables += num_tables; + total_size += level_size_total; + total_active_size += level_size_active; + } + printf(" ---\n"); + printf(" total : %4zu tables, %zu records, " + "%zu / %zu, %s / %s\n", + total_num_tables, total_num_records, + total_active_size, total_size, + Formatter::sizeToString(total_active_size).c_str(), + Formatter::sizeToString(total_size).c_str()); + } + + s = DB::close(db); + if (!s) { + std::cout << "DB close failed: " << db_path << std::endl; + return -1; + } + + return 0; +} + +static int dump_logs(const std::vector& args) { + if (args.size() < 3) { + std::cout + << "too few arguments:" << std::endl + << " " << exec_filename + << " " << args[0] << " []" + << std::endl; + return -1; + } + + Status s; + const std::string& db_path = args[1]; + uint64_t start_idx = std::atoll(args[2].c_str()); + uint64_t end_idx = start_idx; + if (args.size() >= 4) end_idx = std::atoll(args[3].c_str()); + + jungle::DB* db = nullptr; + bool log_mode = false; + int rc = load_db(db_path, db, 
log_mode); + if (rc != 0) return rc; + + // 0 1 2 (3) + // logmeta path 100 + // => display meta of log 100 on terminal. + // + // logmeta path 100 110 + // => display meta of logs [100, 110] on terminal. + // + // dumplog path 100 110 + // => display meta and value (hex) of logs [100, 110] on terminal. + // + // dumplog2file path 100 110 + // => display meta of logs [100, 110] on terminal, + // and dump values to file. + + for (uint64_t ii=start_idx; ii<=end_idx; ++ii) { + Record rec_out; + Record::Holder h(rec_out); + s = db->p->logMgr->getSN(ii, rec_out); + printf( " seq: %zu\n", (size_t)ii ); + if (!s) { + printf(" READ FAILED\n"); + continue; + } + printf( " key: %s\n", + HexDump::toString(rec_out.kv.key).c_str() ); + printf( " meta: %s\n", + HexDump::toString(rec_out.meta).c_str() ); + if (args[0] == "dumplog") { + printf( " value: %s\n", + HexDump::toString(rec_out.kv.value).c_str() ); + + } else if (args[0] == "dumplog2file") { + std::ofstream fs; + std::string filename = "log_dump_" + std::to_string(ii); + fs.open(filename); + fs << rec_out.kv.value.toString(); + fs.close(); + } + printf( "\n" ); + } + + return 0; +} + +static int table_info(const std::vector& args) { + if (args.size() < 2) { + std::cout + << "too few arguments:" << std::endl + << " " << exec_filename + << " " << args[0] << " []" + << std::endl; + return -1; + } + + Status s; + const std::string& db_path = args[1]; + jungle::DB* db = nullptr; + bool log_mode = false; + int rc = load_db(db_path, db, log_mode); + if (rc != 0) return rc; + + // Number of levels. 
+ size_t num_levels = db->p->tableMgr->getNumLevels(); + printf("number of levels: %zu (bottommost level %zu)\n", + num_levels, num_levels - 1); + + int target_level = -1; + if (args.size() >= 3) { + target_level = std::atoi(args[2].c_str()); + } + + for (size_t ii=0; ii= 0 && target_level != (int)ii) continue; + + printf(" level %zu:\n", ii); + std::list tables; + db->p->tableMgr->mani->getTablesRange + ( ii, SizedBuf(), SizedBuf(), tables ); + for (auto& entry: tables) { + TableInfo*& t_info = entry; + TableStats t_stats; + t_info->file->getStats(t_stats); + + printf(" table %zu:\n", (size_t)t_info->number); + if (ii == 0) { + // L0: hash partition. + printf(" hash: %u\n", t_info->hashNum); + } else { + printf(" min key: %s", + HexDump::toString(t_info->minKey).c_str()); + + SizedBuf max_key_out; + SizedBuf::Holder h(max_key_out); + t_info->file->getMaxKey(max_key_out); + printf(" max key: %s", + HexDump::toString(max_key_out).c_str()); + } + printf(" number of records: %zu\n", (size_t)t_stats.numKvs); + printf(" last sequence number: %zu\n", (size_t)t_stats.lastSeqnum); + printf(" space: %zu / %zu, %s / %s\n", + (size_t)t_stats.workingSetSizeByte, + (size_t)t_stats.totalSizeByte, + Formatter::sizeToString(t_stats.workingSetSizeByte).c_str(), + Formatter::sizeToString(t_stats.totalSizeByte).c_str()); + printf(" block reuse cycle: %zu\n", (size_t)t_stats.blockReuseCycle); + printf(" status: %d\n", t_info->status.load()); + + t_info->done(); + printf("\n"); + } + printf("\n"); + } + + s = DB::close(db); + if (!s) { + std::cout << "DB close failed: " << db_path << std::endl; + return -1; + } + + return 0; +} + +}; + +void usage(int argc, char** argv) { + std::stringstream ss; + ss << "Usage:" << std::endl; + ss << " " << argv[0] << " []" + << std::endl << std::endl; + + ss << "Commands:" << std::endl; + ss << " overview " + << "Print log and table file info." << std::endl; + ss << " logmeta " + << "Print key, seq number, and meta of logs in given range." 
<< std::endl; + ss << " dumplog " + << "In addition to logmeta, print value as well." << std::endl; + ss << " dumplog2file " + << "In addition to logmeta, dump value to a file (per log)." << std::endl; + ss << " tableinfo " + << "Print table info in each level." << std::endl; + + std::cout << ss.str(); + + exit(0); +} + +int process_cmd(int argc, char** argv) { + std::vector args; + for (int ii=1; ii