diff --git a/.github/licenserc.yml b/.github/licenserc.yml index eaf7a49eee4..b122b2c9775 100644 --- a/.github/licenserc.yml +++ b/.github/licenserc.yml @@ -25,6 +25,7 @@ header: - '**/LICENSE.TXT' - '**/cipher-file-256' - '**/asan.suppression' + - '**/tsan.suppression' - '**/LICENSE.TXT' - '**/LICENSE' - '**/README' diff --git a/.gitmodules b/.gitmodules index 8472d78404e..335e1dbd9c8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -82,3 +82,6 @@ [submodule "contrib/cpu_features"] path = contrib/cpu_features url = https://github.com/google/cpu_features +[submodule "contrib/arm-optimized-routines"] + path = contrib/arm-optimized-routines + url = https://github.com/ARM-software/optimized-routines diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e14c205f18..2e33a127807 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -239,10 +239,8 @@ else () set (CMAKE_CXX_STANDARD_REQUIRED ON) endif () -if (NOT ARCH_ARM) - set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") - set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3") -endif () +set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") +set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3") option (DEBUG_WITHOUT_DEBUG_INFO "Set to ON to build dev target without debug info (remove flag `-g` in order to accelerate compiling speed and reduce target binary size)" OFF) if (DEBUG_WITHOUT_DEBUG_INFO) @@ -432,9 +430,6 @@ else (ENABLE_FAILPOINTS) message (STATUS "Failpoints are disabled") endif (ENABLE_FAILPOINTS) -# Enable PageStorage V3 test. -option (ENABLE_V3_PAGESTORAGE "Enables V3 PageStorage" ON) - # Flags for test coverage option (TEST_COVERAGE "Enables flags for test coverage" OFF) option (TEST_COVERAGE_XML "Output XML report for test coverage" OFF) diff --git a/README.md b/README.md index 8a2217b9a42..ab996b6f3d6 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ TiFlash repository is based on [ClickHouse](https://github.com/ClickHouse/ClickH ### Start with TiDB Cloud -Quickly explore TiFlash with [a free trial of TiDB Cloud](https://tidbcloud.com/signup). +Quickly explore TiFlash with [a free trial of TiDB Cloud](https://tidbcloud.com/free-trial). See [TiDB Cloud Quick Start Guide](https://docs.pingcap.com/tidbcloud/tidb-cloud-quickstart). @@ -242,7 +242,30 @@ LSAN_OPTIONS=suppressions=$WORKSPACE/tiflash/test/sanitize/asan.suppression ## Run Integration Tests -TBD. +1. Build your own TiFlash binary in $BUILD with `-DCMAKE_BUILD_TYPE=DEBUG`. +``` +cd $BUILD +cmake $WORKSPACE/tiflash -GNinja -DCMAKE_BUILD_TYPE=DEBUG +ninja tiflash +``` +2. Run a TiDB cluster locally using tiup playground or other tools. +``` +tiup playground nightly --tiflash.binpath $BUILD/dbms/src/Server/tiflash +``` +3. Check $WORKSPACE/tests/_env.sh to make sure the port and build directory are set correctly. +4. Run your integration tests using commands like `./run-test.sh fullstack-test2/ddl` under the $WORKSPACE/tests directory. + +## Run MicroBenchmark Tests + +To run microbenchmark tests, you need to build with `-DCMAKE_BUILD_TYPE=RELEASE -DENABLE_TESTS=ON`: + +```shell +cd $BUILD +cmake $WORKSPACE/tiflash -GNinja -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_TESTS=ON +ninja bench_dbms +``` + +The microbenchmark executable is at `$BUILD/dbms/bench_dbms`; you can run it with `./bench_dbms` or `./bench_dbms --benchmark_filter=xxx`. For more usage, check `./bench_dbms --help`.
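Since `bench_dbms` is built on Google Benchmark (vendored under `contrib/benchmark`), the standard Google Benchmark flags should work as well. A minimal sketch, assuming a set of benchmark cases whose names match the hypothetical pattern `Join`:

```shell
cd $BUILD/dbms
# Run only the benchmarks whose names match the regex "Join.*",
# repeat each case 3 times, and report only the aggregate
# mean/median/stddev rows (standard Google Benchmark flags).
./bench_dbms --benchmark_filter='Join.*' \
    --benchmark_repetitions=3 \
    --benchmark_report_aggregates_only=true
```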
## Generate LLVM Coverage Report diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 71f81ae3ee5..4520d1cb176 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -165,3 +165,7 @@ add_subdirectory(benchmark) set (BUILD_TESTING OFF CACHE BOOL "Disable cpu-features testing" FORCE) add_subdirectory(cpu_features) + +if (ARCH_AARCH64 AND ARCH_LINUX) + add_subdirectory(arm-optimized-routines-cmake) +endif () diff --git a/contrib/arm-optimized-routines b/contrib/arm-optimized-routines new file mode 160000 index 00000000000..e373f659523 --- /dev/null +++ b/contrib/arm-optimized-routines @@ -0,0 +1 @@ +Subproject commit e373f6595230087a8ddea449bfb14b47150b4059 diff --git a/contrib/arm-optimized-routines-cmake/CMakeLists.txt b/contrib/arm-optimized-routines-cmake/CMakeLists.txt new file mode 100644 index 00000000000..89baa7222f3 --- /dev/null +++ b/contrib/arm-optimized-routines-cmake/CMakeLists.txt @@ -0,0 +1,45 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This library overrides performance-critical routines for aarch64 targets. +# The implementations are imported from the official ARM repo. +# To reduce dispatching cost, the indirect function (ifunc) technique is utilized. +# Therefore, this library should only be enabled with ELF targets. + +# Considerations: +# - As of June 2022, most enterprise OSs (CentOS 7, CentOS Stream 8 and RHEL 8) still +# use a relatively old glibc on ARM64, where ASIMD, MTE, DC ZVA and SVE are not +# fully utilized. However, it is becoming increasingly common to use ARM64 instances +# in cloud-native situations. +# - The `optimized-routines` repo is actively maintained by ARM itself. Therefore, its +# quality can be trusted, and using it also lets us keep in sync with the latest +# acceleration techniques.
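As background for the dispatching approach described in the comments above, here is a minimal, self-contained sketch of the GNU ifunc mechanism that the new `src/aor.c` below builds on; `my_memcpy` and its resolver are hypothetical names for illustration, not part of this patch:

```c
// Minimal GNU ifunc sketch (ELF targets only). The resolver runs once at
// load time; every later call to my_memcpy jumps straight to the chosen
// implementation, so there is no per-call dispatch cost.
#include <stddef.h>
#include <string.h>

// Fallback implementation; a real resolver would pick between several
// hand-optimized variants based on hardware capabilities (e.g. getauxval).
static void * my_memcpy_generic(void * dst, const void * src, size_t n)
{
    return memcpy(dst, src, n);
}

// Resolver: returns the function pointer that my_memcpy will be bound to.
static typeof(my_memcpy_generic) * my_memcpy_resolver(void)
{
    return my_memcpy_generic;
}

// Bind my_memcpy to whatever the resolver returns (GNU extension).
void * my_memcpy(void * dst, const void * src, size_t n) __attribute__((ifunc("my_memcpy_resolver")));
```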
+ +set(CMAKE_C_FLAGS "") +ENABLE_LANGUAGE(C) +ENABLE_LANGUAGE(ASM) +set(TIFLASH_AOR_DIR ../arm-optimized-routines) + +file(GLOB TIFLASH_AARCH64_STRING_FILES ${TIFLASH_AOR_DIR}/string/aarch64/*.S) +add_library(tiflash-aarch64-string STATIC ${TIFLASH_AARCH64_STRING_FILES} src/aor.c) +target_compile_options(tiflash-aarch64-string PRIVATE -march=armv8-a+sve) +target_include_directories(tiflash-aarch64-string PRIVATE ${TIFLASH_AOR_DIR}/string/include) + +file(GLOB TIFLASH_AARCH64_MATH_FILES ${TIFLASH_AOR_DIR}/math/*.c) +add_library(tiflash-aarch64-math STATIC ${TIFLASH_AARCH64_MATH_FILES}) +target_include_directories(tiflash-aarch64-math PRIVATE ${TIFLASH_AOR_DIR}/math/include) + +# It is reasonable to keep these libraries optimized. +target_compile_options(tiflash-aarch64-string PRIVATE -O3 -g3 -fno-omit-frame-pointer -ffunction-sections -fdata-sections) +target_compile_options(tiflash-aarch64-math PRIVATE -O3 -g3 -fno-omit-frame-pointer -ffunction-sections -fdata-sections) diff --git a/contrib/arm-optimized-routines-cmake/src/aor.c b/contrib/arm-optimized-routines-cmake/src/aor.c new file mode 100644 index 00000000000..daff1df3c4b --- /dev/null +++ b/contrib/arm-optimized-routines-cmake/src/aor.c @@ -0,0 +1,115 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <asm/hwcap.h> +#include <stdbool.h> +#include <stddef.h> +#include <string.h> +#include <stringlib.h> +#include <sys/auxv.h> + +// Provide default macro definitions in case they are not defined on the current Linux distro. +// For example, TiFlash compiled on older Linux kernels may also be used on newer ones. +// These values should be stable for Linux: only false negatives are expected when running on +// older kernels, but that is acceptable, as `google/cpu_features` does the same. +#ifndef HWCAP2_MTE +#define HWCAP2_MTE (1 << 18) +#endif + +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#ifndef AT_HWCAP +#define AT_HWCAP 16 +#endif + +/// check if MTE is supported in the current environment +static inline bool mte_supported(void) +{ + return (getauxval(AT_HWCAP2) & HWCAP2_MTE) != 0; +} + +/// check if SVE is supported in the current environment +static inline bool sve_supported(void) +{ + return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; +} + +#define STRINGIFY_IMPL(X) #X +#define STRINGIFY(X) STRINGIFY_IMPL(X) +/** + * \brief + * Symbols are defined with hidden visibility. Therefore, the implementations here only override routines within the + * TiFlash binary itself. This is because dependencies like `ld.so`, `libgcc_s.so`, etc. need essential routines like + * `memcpy` to finish the early loading procedure, so declaring such symbols as visible indirect functions would + * create cyclic dependencies. It shall be good enough to override symbols within TiFlash, as most heavy + * computation happens in the main binary.
+ * \param NAME: exported symbol name + * \param SVE: preferred implementation when SVE is available + * \param MTE: preferred implementation when MTE is available + * \param ASIMD: preferred implementation for generic aarch64 targets (ASIMD is required by default for Armv8 and above) + */ +#define DISPATCH(NAME, SVE, MTE, ASIMD) \ + extern typeof(ASIMD) __tiflash_##NAME __attribute__((ifunc(STRINGIFY(__tiflash_##NAME##_resolver)))); \ + extern typeof(ASIMD) NAME __attribute__((visibility("hidden"), alias(STRINGIFY(__tiflash_##NAME)))); \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wunused-function\"") static typeof(ASIMD) * __tiflash_##NAME##_resolver(void) \ + { \ + if (sve_supported()) \ + { \ + return SVE; \ + } \ + if (mte_supported()) \ + { \ + return MTE; \ + } \ + return ASIMD; \ + } \ + _Pragma("GCC diagnostic pop") +#undef memcpy +#undef memmove +#undef memset +#undef memchr +#undef memrchr +#undef memcmp +#undef strcpy +#undef stpcpy +#undef strcmp +#undef strchr +#undef strrchr +#undef strchrnul +#undef strlen +#undef strnlen +#undef strncmp + +DISPATCH(memcpy, __memcpy_aarch64_sve, __memcpy_aarch64_simd, __memcpy_aarch64_simd) +DISPATCH(memmove, __memmove_aarch64_sve, __memmove_aarch64_simd, __memmove_aarch64_simd) +DISPATCH(memset, __memset_aarch64, __memset_aarch64, __memset_aarch64) +DISPATCH(memchr, __memchr_aarch64_sve, __memchr_aarch64_mte, __memchr_aarch64) +DISPATCH(memrchr, __memrchr_aarch64, __memrchr_aarch64, __memrchr_aarch64) +DISPATCH(memcmp, __memcmp_aarch64_sve, __memcmp_aarch64, __memcmp_aarch64) +DISPATCH(strcpy, __strcpy_aarch64_sve, __strcpy_aarch64, __strcpy_aarch64) +DISPATCH(stpcpy, __stpcpy_aarch64_sve, __stpcpy_aarch64, __stpcpy_aarch64) +DISPATCH(strcmp, __strcmp_aarch64_sve, __strcmp_aarch64, __strcmp_aarch64) +DISPATCH(strchr, __strchr_aarch64_sve, __strchr_aarch64_mte, __strchr_aarch64) +DISPATCH(strrchr, __strrchr_aarch64_sve, __strrchr_aarch64_mte, __strrchr_aarch64) +DISPATCH(strchrnul, __strchrnul_aarch64_sve, __strchrnul_aarch64_mte, __strchrnul_aarch64) +DISPATCH(strlen, __strlen_aarch64_sve, __strlen_aarch64_mte, __strlen_aarch64) +DISPATCH(strnlen, __strnlen_aarch64_sve, __strnlen_aarch64, __strnlen_aarch64) +DISPATCH(strncmp, __strncmp_aarch64_sve, __strncmp_aarch64, __strncmp_aarch64) \ No newline at end of file diff --git a/contrib/client-c b/contrib/client-c index 36e05cb0f24..034d1e782cb 160000 --- a/contrib/client-c +++ b/contrib/client-c @@ -1 +1 @@ -Subproject commit 36e05cb0f24c085785abf367176dac2a45bfd67b +Subproject commit 034d1e782cb4697f99b09b679c00dade00f19dd5 diff --git a/contrib/jemalloc b/contrib/jemalloc index ea6b3e973b4..54eaed1d8b5 160000 --- a/contrib/jemalloc +++ b/contrib/jemalloc @@ -1 +1 @@ -Subproject commit ea6b3e973b477b8061e0076bb257dbd7f3faa756 +Subproject commit 54eaed1d8b56b1aa528be3bdd1877e59c56fa90c diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index ef02fbabc81..91b17eb8ec7 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -4,65 +4,136 @@ endif() set(JEMALLOC_SOURCE_DIR ${TiFlash_SOURCE_DIR}/contrib/jemalloc) -set(SRCS -${JEMALLOC_SOURCE_DIR}/src/arena.c -${JEMALLOC_SOURCE_DIR}/src/background_thread.c -${JEMALLOC_SOURCE_DIR}/src/base.c -${JEMALLOC_SOURCE_DIR}/src/bin.c -${JEMALLOC_SOURCE_DIR}/src/bitmap.c -${JEMALLOC_SOURCE_DIR}/src/ckh.c -${JEMALLOC_SOURCE_DIR}/src/ctl.c -${JEMALLOC_SOURCE_DIR}/src/div.c -${JEMALLOC_SOURCE_DIR}/src/extent.c -${JEMALLOC_SOURCE_DIR}/src/extent_dss.c 
-${JEMALLOC_SOURCE_DIR}/src/extent_mmap.c -${JEMALLOC_SOURCE_DIR}/src/hash.c -${JEMALLOC_SOURCE_DIR}/src/hook.c -${JEMALLOC_SOURCE_DIR}/src/jemalloc.c -${JEMALLOC_SOURCE_DIR}/src/jemalloc_cpp.cpp -${JEMALLOC_SOURCE_DIR}/src/large.c -${JEMALLOC_SOURCE_DIR}/src/log.c -${JEMALLOC_SOURCE_DIR}/src/malloc_io.c -${JEMALLOC_SOURCE_DIR}/src/mutex.c -${JEMALLOC_SOURCE_DIR}/src/mutex_pool.c -${JEMALLOC_SOURCE_DIR}/src/nstime.c -${JEMALLOC_SOURCE_DIR}/src/pages.c -${JEMALLOC_SOURCE_DIR}/src/prng.c -${JEMALLOC_SOURCE_DIR}/src/prof.c -${JEMALLOC_SOURCE_DIR}/src/rtree.c -${JEMALLOC_SOURCE_DIR}/src/sc.c -${JEMALLOC_SOURCE_DIR}/src/stats.c -${JEMALLOC_SOURCE_DIR}/src/sz.c -${JEMALLOC_SOURCE_DIR}/src/tcache.c -${JEMALLOC_SOURCE_DIR}/src/test_hooks.c -${JEMALLOC_SOURCE_DIR}/src/ticker.c -${JEMALLOC_SOURCE_DIR}/src/tsd.c -${JEMALLOC_SOURCE_DIR}/src/witness.c -${JEMALLOC_SOURCE_DIR}/src/safety_check.c +set (SRCS + "${JEMALLOC_SOURCE_DIR}/src/arena.c" + "${JEMALLOC_SOURCE_DIR}/src/background_thread.c" + "${JEMALLOC_SOURCE_DIR}/src/base.c" + "${JEMALLOC_SOURCE_DIR}/src/bin.c" + "${JEMALLOC_SOURCE_DIR}/src/bin_info.c" + "${JEMALLOC_SOURCE_DIR}/src/bitmap.c" + "${JEMALLOC_SOURCE_DIR}/src/buf_writer.c" + "${JEMALLOC_SOURCE_DIR}/src/cache_bin.c" + "${JEMALLOC_SOURCE_DIR}/src/ckh.c" + "${JEMALLOC_SOURCE_DIR}/src/counter.c" + "${JEMALLOC_SOURCE_DIR}/src/ctl.c" + "${JEMALLOC_SOURCE_DIR}/src/decay.c" + "${JEMALLOC_SOURCE_DIR}/src/div.c" + "${JEMALLOC_SOURCE_DIR}/src/ecache.c" + "${JEMALLOC_SOURCE_DIR}/src/edata.c" + "${JEMALLOC_SOURCE_DIR}/src/edata_cache.c" + "${JEMALLOC_SOURCE_DIR}/src/ehooks.c" + "${JEMALLOC_SOURCE_DIR}/src/emap.c" + "${JEMALLOC_SOURCE_DIR}/src/eset.c" + "${JEMALLOC_SOURCE_DIR}/src/exp_grow.c" + "${JEMALLOC_SOURCE_DIR}/src/extent.c" + "${JEMALLOC_SOURCE_DIR}/src/extent_dss.c" + "${JEMALLOC_SOURCE_DIR}/src/extent_mmap.c" + "${JEMALLOC_SOURCE_DIR}/src/fxp.c" + "${JEMALLOC_SOURCE_DIR}/src/hook.c" + "${JEMALLOC_SOURCE_DIR}/src/hpa.c" + "${JEMALLOC_SOURCE_DIR}/src/hpa_hooks.c" + "${JEMALLOC_SOURCE_DIR}/src/hpdata.c" + "${JEMALLOC_SOURCE_DIR}/src/inspect.c" + "${JEMALLOC_SOURCE_DIR}/src/jemalloc.c" + "${JEMALLOC_SOURCE_DIR}/src/large.c" + "${JEMALLOC_SOURCE_DIR}/src/log.c" + "${JEMALLOC_SOURCE_DIR}/src/malloc_io.c" + "${JEMALLOC_SOURCE_DIR}/src/mutex.c" + "${JEMALLOC_SOURCE_DIR}/src/nstime.c" + "${JEMALLOC_SOURCE_DIR}/src/pa.c" + "${JEMALLOC_SOURCE_DIR}/src/pac.c" + "${JEMALLOC_SOURCE_DIR}/src/pa_extra.c" + "${JEMALLOC_SOURCE_DIR}/src/pages.c" + "${JEMALLOC_SOURCE_DIR}/src/pai.c" + "${JEMALLOC_SOURCE_DIR}/src/peak_event.c" + "${JEMALLOC_SOURCE_DIR}/src/prof.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_data.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_log.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_recent.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_stats.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_sys.c" + "${JEMALLOC_SOURCE_DIR}/src/psset.c" + "${JEMALLOC_SOURCE_DIR}/src/rtree.c" + "${JEMALLOC_SOURCE_DIR}/src/safety_check.c" + "${JEMALLOC_SOURCE_DIR}/src/san_bump.c" + "${JEMALLOC_SOURCE_DIR}/src/san.c" + "${JEMALLOC_SOURCE_DIR}/src/sc.c" + "${JEMALLOC_SOURCE_DIR}/src/sec.c" + "${JEMALLOC_SOURCE_DIR}/src/stats.c" + "${JEMALLOC_SOURCE_DIR}/src/sz.c" + "${JEMALLOC_SOURCE_DIR}/src/tcache.c" + "${JEMALLOC_SOURCE_DIR}/src/test_hooks.c" + "${JEMALLOC_SOURCE_DIR}/src/thread_event.c" + "${JEMALLOC_SOURCE_DIR}/src/ticker.c" + "${JEMALLOC_SOURCE_DIR}/src/tsd.c" + "${JEMALLOC_SOURCE_DIR}/src/witness.c" ) if(CMAKE_SYSTEM_NAME MATCHES "Darwin") list(APPEND SRCS ${JEMALLOC_SOURCE_DIR}/src/zone.c) endif() +if (ARCH_LINUX) + # ThreadPool select job 
randomly, and there can be some threads that have + # performed some memory-heavy task before and will be inactive for some time, + # but until they become active again, the memory will not be freed, since by + # default each thread has its own arena, and there can be up to + # 4*CPU arenas (see the opt.narenas description). + # + # By enabling percpu_arena, the number of arenas is limited to the number of CPUs, and hence + # this problem should go away. + # + # muzzy_decay_ms -- use MADV_FREE when available on newer Linuxes, to + # avoid spurious latencies and additional work associated with + # MADV_DONTNEED. See + # https://github.com/ClickHouse/ClickHouse/issues/11121 for motivation. + set (JEMALLOC_CONFIG_MALLOC_CONF "percpu_arena:percpu,oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000") +else() + set (JEMALLOC_CONFIG_MALLOC_CONF "oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000") +endif() + +message (STATUS "jemalloc malloc_conf: ${JEMALLOC_CONFIG_MALLOC_CONF}") + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w") endif () add_library(jemalloc STATIC ${SRCS}) +set (JEMALLOC_INCLUDE_PREFIX) + +if (ARCH_LINUX) + set (JEMALLOC_INCLUDE_PREFIX "include_linux") + target_compile_definitions(jemalloc PRIVATE JEMALLOC_MADV_FREE=8) +elseif (ARCH_FREEBSD) + set (JEMALLOC_INCLUDE_PREFIX "include_freebsd") +elseif (APPLE) + set (JEMALLOC_INCLUDE_PREFIX "include_darwin") +else () + message (FATAL_ERROR "internal jemalloc: This OS is not supported") +endif () -if (ARCH_ARM) - target_include_directories(jemalloc PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/include_linux_aarch64) +if (ARCH_AMD64) + if (USE_MUSL) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_x86_64_musl") + else() + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_x86_64") + endif() +elseif (ARCH_AARCH64) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_aarch64") +elseif (ARCH_PPC64LE) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_ppc64le") +elseif (ARCH_RISCV64) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_riscv64") else () - target_include_directories(jemalloc PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/include_linux_x86_64) + message (FATAL_ERROR "internal jemalloc: This arch is not supported") endif () -target_include_directories(jemalloc PRIVATE - ${JEMALLOC_SOURCE_DIR}/include) +configure_file(${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal/jemalloc_internal_defs.h.in + ${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal/jemalloc_internal_defs.h) +target_include_directories(jemalloc SYSTEM PRIVATE + "${CMAKE_CURRENT_BINARY_DIR}/${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal") + +target_include_directories(jemalloc PUBLIC ${JEMALLOC_SOURCE_DIR}/include ${TiFlash_SOURCE_DIR}/contrib/jemalloc-cmake/include) target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_NO_PRIVATE_NAMESPACE) @@ -80,3 +151,5 @@ if (ENABLE_JEMALLOC_PROF) target_link_libraries (jemalloc PRIVATE ${UNWIND_LIBRARY}) endif () endif () + +target_compile_options(jemalloc PRIVATE -D_GNU_SOURCE) diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_preamble.h b/contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_preamble.h similarity index 69% rename from contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_preamble.h rename to contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_preamble.h index d79551e1f25..45f43a6cd02 100644 ---
a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_preamble.h +++ b/contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_preamble.h @@ -1,27 +1,33 @@ #ifndef JEMALLOC_PREAMBLE_H #define JEMALLOC_PREAMBLE_H -#include "jemalloc_internal_defs.h" #include "jemalloc/internal/jemalloc_internal_decls.h" +#include "jemalloc_internal_defs.h" -#ifdef JEMALLOC_UTRACE +#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL) #include <sys/ktrace.h> +#if defined(JEMALLOC_UTRACE) +#define UTRACE_CALL(p, l) utrace(p, l) +#else +#define UTRACE_CALL(p, l) utrace("jemalloc_process", p, l) +#define JEMALLOC_UTRACE +#endif #endif #define JEMALLOC_NO_DEMANGLE #ifdef JEMALLOC_JET -# undef JEMALLOC_IS_MALLOC -# define JEMALLOC_N(n) jet_##n -# include "jemalloc/internal/public_namespace.h" -# define JEMALLOC_NO_RENAME -# include "jemalloc/jemalloc.h" -# undef JEMALLOC_NO_RENAME +#undef JEMALLOC_IS_MALLOC +#define JEMALLOC_N(n) jet_##n +#include "jemalloc/internal/public_namespace.h" +#define JEMALLOC_NO_RENAME +#include "jemalloc/jemalloc.h" +#undef JEMALLOC_NO_RENAME #else -# define JEMALLOC_N(n) je_##n -# include "jemalloc/jemalloc.h" +#define JEMALLOC_N(n) je_##n +#include "jemalloc/jemalloc.h" #endif -#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN)) +#if defined(JEMALLOC_OSATOMIC) #include <libkern/OSAtomic.h> #endif @@ -39,16 +45,16 @@ * possible. */ #ifndef JEMALLOC_NO_PRIVATE_NAMESPACE -# ifndef JEMALLOC_JET -# include "jemalloc/internal/private_namespace.h" -# else -# include "jemalloc/internal/private_namespace_jet.h" -# endif +#ifndef JEMALLOC_JET +#include "jemalloc/internal/private_namespace.h" +#else +#include "jemalloc/internal/private_namespace_jet.h" +#endif #endif #include "jemalloc/internal/test_hooks.h" #ifdef JEMALLOC_DEFINE_MADVISE_FREE -# define JEMALLOC_MADV_FREE 8 +#define JEMALLOC_MADV_FREE 8 #endif static const bool config_debug = @@ -161,7 +167,55 @@ static const bool config_log = false #endif ; -#ifdef JEMALLOC_HAVE_SCHED_GETCPU +/* + * Are extra safety checks enabled; things like checking the size of sized + * deallocations, double-frees, etc. + */ +static const bool config_opt_safety_checks = +#ifdef JEMALLOC_OPT_SAFETY_CHECKS + true +#elif defined(JEMALLOC_DEBUG) + /* + * This lets us only guard safety checks by one flag instead of two; fast + * checks can guard solely by config_opt_safety_checks and run in debug mode + * too. + */ + true +#else + false +#endif + ; + +/* + * Extra debugging of sized deallocations too onerous to be included in the + * general safety checks. + */ +static const bool config_opt_size_checks = +#if defined(JEMALLOC_OPT_SIZE_CHECKS) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +static const bool config_uaf_detection = +#if defined(JEMALLOC_UAF_DETECTION) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +/* Whether or not the C++ extensions are enabled. */ +static const bool config_enable_cxx = +#ifdef JEMALLOC_ENABLE_CXX + true +#else + false +#endif + ; + +#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU) /* Currently percpu_arena depends on sched_getcpu. */ #define JEMALLOC_PERCPU_ARENA #endif @@ -190,23 +244,16 @@ static const bool have_background_thread = false #endif ; - -#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1 -#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1 - -/* - * Are extra safety checks enabled; things like checking the size of sized - * deallocations, double-frees, etc.
- */ -static const bool config_opt_safety_checks = -#ifdef JEMALLOC_OPT_SAFETY_CHECKS +static const bool config_high_res_timer = +#ifdef JEMALLOC_HAVE_CLOCK_REALTIME true -#elif defined(JEMALLOC_DEBUG) - /* - * This lets us only guard safety checks by one flag instead of two; fast - * checks can guard solely by config_opt_safety_checks and run in debug mode - * too. - */ +#else + false +#endif + ; + +static const bool have_memcntl = +#ifdef JEMALLOC_HAVE_MEMCNTL true #else false diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc.h index d06243c5239..e90fa892100 100644 --- a/contrib/jemalloc-cmake/include/jemalloc/jemalloc.h +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc.h @@ -4,13 +4,21 @@ extern "C" { #endif +#if !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wredundant-decls" +#endif + #include <jemalloc/jemalloc_defs.h> -#include <jemalloc/jemalloc_rename.h> #include <jemalloc/jemalloc_macros.h> #include <jemalloc/jemalloc_protos.h> +#include <jemalloc/jemalloc_rename.h> #include <jemalloc/jemalloc_typedefs.h> +#if !defined(__clang__) +#pragma GCC diagnostic pop +#endif + #ifdef __cplusplus } #endif - diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_defs.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_defs.h similarity index 67% rename from contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_defs.h rename to contrib/jemalloc-cmake/include/jemalloc/jemalloc_defs.h index d1389237a77..1fc77be57cf 100644 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_defs.h +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_defs.h @@ -5,15 +5,29 @@ /* Defined if alloc_size attribute is supported. */ #define JEMALLOC_HAVE_ATTR_ALLOC_SIZE +/* Defined if format_arg(...) attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FORMAT_ARG + +/* Defined if format(gnu_printf, ...) attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF */ + /* Defined if format(printf, ...) attribute is supported. */ #define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF +/* Defined if fallthrough attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FALLTHROUGH + +/* Defined if cold attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_COLD + /* * Define overrides for non-standard allocator-related functions if they are * present on the system. */ +#if !defined(USE_MUSL) #define JEMALLOC_OVERRIDE_MEMALIGN #define JEMALLOC_OVERRIDE_VALLOC +#endif /* * At least Linux omits the "const" in: @@ -32,11 +46,11 @@ #define JEMALLOC_USE_CXX_THROW #ifdef _MSC_VER -# ifdef _WIN64 -# define LG_SIZEOF_PTR_WIN 3 -# else -# define LG_SIZEOF_PTR_WIN 2 -# endif +#ifdef _WIN64 +#define LG_SIZEOF_PTR_WIN 3 +#else +#define LG_SIZEOF_PTR_WIN 2 +#endif #endif /* sizeof(void *) == 2^LG_SIZEOF_PTR. */ diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_macros.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_macros.h new file mode 100644 index 00000000000..ccb22470e64 --- /dev/null +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_macros.h @@ -0,0 +1,148 @@ +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <limits.h> +#include <strings.h> + +#define JEMALLOC_VERSION "5.3-RC" +#define JEMALLOC_VERSION_MAJOR 5 +#define JEMALLOC_VERSION_MINOR 3 +#define JEMALLOC_VERSION_BUGFIX 0 +#define JEMALLOC_VERSION_NREV 0 +#define JEMALLOC_VERSION_GID "ca709c3139f77f4c00a903cdee46d71e9028f6c6" +#define JEMALLOC_VERSION_GID_IDENT ca709c3139f77f4c00a903cdee46d71e9028f6c6 + +#define MALLOCX_LG_ALIGN(la) ((int)(la)) +#if LG_SIZEOF_PTR == 2 +#define MALLOCX_ALIGN(a) ((int)(ffs((int)(a)) - 1)) +#else +#define MALLOCX_ALIGN(a) \ + ((int)(((size_t)(a) < (size_t)INT_MAX) ?
ffs((int)(a)) - 1 : ffs((int)(((size_t)(a)) >> 32)) + 31)) +#endif +#define MALLOCX_ZERO ((int)0x40) +/* + * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 + * encodes MALLOCX_TCACHE_NONE. + */ +#define MALLOCX_TCACHE(tc) ((int)(((tc) + 2) << 8)) +#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) +/* + * Bias arena index bits so that 0 encodes "use an automatically chosen arena". + */ +#define MALLOCX_ARENA(a) ((((int)(a)) + 1) << 20) + +/* + * Use as arena index in "arena..{purge,decay,dss}" and + * "stats.arenas..*" mallctl interfaces to select all arenas. This + * definition is intentionally specified in raw decimal format to support + * cpp-based string concatenation, e.g. + * + * #define STRINGIFY_HELPER(x) #x + * #define STRINGIFY(x) STRINGIFY_HELPER(x) + * + * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, + * 0); + */ +#define MALLCTL_ARENAS_ALL 4096 +/* + * Use as arena index in "stats.arenas..*" mallctl interfaces to select + * destroyed arenas. + */ +#define MALLCTL_ARENAS_DESTROYED 4097 + +#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) +#define JEMALLOC_CXX_THROW throw() +#else +#define JEMALLOC_CXX_THROW +#endif + +#if defined(_MSC_VER) +#define JEMALLOC_ATTR(s) +#define JEMALLOC_ALIGNED(s) __declspec(align(s)) +#define JEMALLOC_ALLOC_SIZE(s) +#define JEMALLOC_ALLOC_SIZE2(s1, s2) +#ifndef JEMALLOC_EXPORT +#ifdef DLLEXPORT +#define JEMALLOC_EXPORT __declspec(dllexport) +#else +#define JEMALLOC_EXPORT __declspec(dllimport) +#endif +#endif +#define JEMALLOC_FORMAT_ARG(i) +#define JEMALLOC_FORMAT_PRINTF(s, i) +#define JEMALLOC_FALLTHROUGH +#define JEMALLOC_NOINLINE __declspec(noinline) +#ifdef __cplusplus +#define JEMALLOC_NOTHROW __declspec(nothrow) +#else +#define JEMALLOC_NOTHROW +#endif +#define JEMALLOC_SECTION(s) __declspec(allocate(s)) +#define JEMALLOC_RESTRICT_RETURN __declspec(restrict) +#if _MSC_VER >= 1900 && !defined(__EDG__) +#define JEMALLOC_ALLOCATOR __declspec(allocator) +#else +#define JEMALLOC_ALLOCATOR +#endif +#define JEMALLOC_COLD +#elif defined(JEMALLOC_HAVE_ATTR) +#define JEMALLOC_ATTR(s) __attribute__((s)) +#define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) +#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +#define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +#else +#define JEMALLOC_ALLOC_SIZE(s) +#define JEMALLOC_ALLOC_SIZE2(s1, s2) +#endif +#ifndef JEMALLOC_EXPORT +#define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +#endif +#ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG +#define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) +#else +#define JEMALLOC_FORMAT_ARG(i) +#endif +#ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF +#define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) +#elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) +#define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) +#else +#define JEMALLOC_FORMAT_PRINTF(s, i) +#endif +#ifdef JEMALLOC_HAVE_ATTR_FALLTHROUGH +#define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough) +#else +#define JEMALLOC_FALLTHROUGH +#endif +#define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) +#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +#define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +#define JEMALLOC_RESTRICT_RETURN +#define JEMALLOC_ALLOCATOR +#ifdef JEMALLOC_HAVE_ATTR_COLD +#define JEMALLOC_COLD JEMALLOC_ATTR(__cold__) +#else +#define JEMALLOC_COLD +#endif +#else +#define JEMALLOC_ATTR(s) +#define JEMALLOC_ALIGNED(s) +#define 
JEMALLOC_ALLOC_SIZE(s) +#define JEMALLOC_ALLOC_SIZE2(s1, s2) +#define JEMALLOC_EXPORT +#define JEMALLOC_FORMAT_PRINTF(s, i) +#define JEMALLOC_FALLTHROUGH +#define JEMALLOC_NOINLINE +#define JEMALLOC_NOTHROW +#define JEMALLOC_SECTION(s) +#define JEMALLOC_RESTRICT_RETURN +#define JEMALLOC_ALLOCATOR +#define JEMALLOC_COLD +#endif + +#if (defined(__APPLE__) || defined(__FreeBSD__)) && !defined(JEMALLOC_NO_RENAME) +#define JEMALLOC_SYS_NOTHROW +#else +#define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW +#endif diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos.h new file mode 100644 index 00000000000..31f72d3a2af --- /dev/null +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos.h @@ -0,0 +1,86 @@ +// OSX does not have this for system alloc functions, so you will get +// "exception specification in declaration" error. +#if defined(__APPLE__) || defined(__FreeBSD__) || defined(USE_MUSL) +#undef JEMALLOC_NOTHROW +#define JEMALLOC_NOTHROW + +#undef JEMALLOC_SYS_NOTHROW +#define JEMALLOC_SYS_NOTHROW + +#undef JEMALLOC_CXX_THROW +#define JEMALLOC_CXX_THROW +#endif + +#include "jemalloc_rename.h" + +/* + * The je_ prefix on the following public symbol declarations is an artifact + * of namespace management, and should be omitted in application code unless + * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). + */ +extern JEMALLOC_EXPORT const char * je_malloc_conf; +extern JEMALLOC_EXPORT void (*je_malloc_message)(void * cbopaque, + const char * s); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_SYS_NOTHROW je_posix_memalign( + void ** memptr, + size_t alignment, + size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_realloc(void * ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_SYS_NOTHROW je_free(void * ptr) + JEMALLOC_CXX_THROW; + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * je_mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * je_rallocx(void * ptr, size_t size, int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void * ptr, size_t size, size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void * ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void * ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void * ptr, size_t size, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char * name, + void * oldp, + size_t * oldlenp, + void * newp, + size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char * 
name, + size_t * mibp, + size_t * miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t * mib, + size_t miblen, + void * oldp, + size_t * oldlenp, + void * newp, + size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( + void (*write_cb)(void *, const char *), + void * je_cbopaque, + const char * opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void * ptr) JEMALLOC_CXX_THROW; +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_size( + const void * ptr); +#endif + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_memalign(size_t alignment, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos_jet.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos_jet.h new file mode 100644 index 00000000000..195d57e2997 --- /dev/null +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos_jet.h @@ -0,0 +1,71 @@ +/* + * The jet_ prefix on the following public symbol declarations is an artifact + * of namespace management, and should be omitted in application code unless + * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle@install_suffix@.h). + */ +extern JEMALLOC_EXPORT const char * jet_malloc_conf; +extern JEMALLOC_EXPORT void (*jet_malloc_message)(void * cbopaque, + const char * s); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_SYS_NOTHROW jet_posix_memalign( + void ** memptr, + size_t alignment, + size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_realloc(void * ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_SYS_NOTHROW jet_free(void * ptr) + JEMALLOC_CXX_THROW; + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * jet_mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * jet_rallocx(void * ptr, size_t size, int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_xallocx(void * ptr, size_t size, size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_sallocx(const void * ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_dallocx(void * ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_sdallocx(void * ptr, size_t size, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctl(const 
char * name, + void * oldp, + size_t * oldlenp, + void * newp, + size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctlnametomib(const char * name, + size_t * mibp, + size_t * miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctlbymib(const size_t * mib, + size_t miblen, + void * oldp, + size_t * oldlenp, + void * newp, + size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_malloc_stats_print( + void (*write_cb)(void *, const char *), + void * jet_cbopaque, + const char * opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void * ptr) JEMALLOC_CXX_THROW; +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_malloc_size( + const void * ptr); +#endif + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_memalign(size_t alignment, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_rename.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_rename.h index a2ea2dd3533..d032d46752d 100644 --- a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_rename.h +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_rename.h @@ -4,26 +4,28 @@ * these macro definitions. */ #ifndef JEMALLOC_NO_RENAME -# define je_aligned_alloc aligned_alloc -# define je_calloc calloc -# define je_dallocx dallocx -# define je_free free -# define je_mallctl mallctl -# define je_mallctlbymib mallctlbymib -# define je_mallctlnametomib mallctlnametomib -# define je_malloc malloc -# define je_malloc_conf malloc_conf -# define je_malloc_message malloc_message -# define je_malloc_stats_print malloc_stats_print -# define je_malloc_usable_size malloc_usable_size -# define je_mallocx mallocx -# define je_nallocx nallocx -# define je_posix_memalign posix_memalign -# define je_rallocx rallocx -# define je_realloc realloc -# define je_sallocx sallocx -# define je_sdallocx sdallocx -# define je_xallocx xallocx -# define je_memalign memalign -# define je_valloc valloc +#define je_aligned_alloc aligned_alloc +#define je_calloc calloc +#define je_dallocx dallocx +#define je_free free +#define je_mallctl mallctl +#define je_mallctlbymib mallctlbymib +#define je_mallctlnametomib mallctlnametomib +#define je_malloc malloc +#define je_malloc_conf malloc_conf +#define je_malloc_conf_2_conf_harder malloc_conf_2_conf_harder +#define je_malloc_message malloc_message +#define je_malloc_stats_print malloc_stats_print +#define je_malloc_usable_size malloc_usable_size +#define je_mallocx mallocx +#define je_smallocx_ca709c3139f77f4c00a903cdee46d71e9028f6c6 smallocx_ca709c3139f77f4c00a903cdee46d71e9028f6c6 +#define je_nallocx nallocx +#define je_posix_memalign posix_memalign +#define je_rallocx rallocx +#define je_realloc realloc +#define je_sallocx sallocx +#define je_sdallocx sdallocx +#define je_xallocx xallocx +#define je_memalign memalign +#define je_valloc valloc #endif diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_typedefs.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_typedefs.h similarity index 57% rename from contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_typedefs.h rename to contrib/jemalloc-cmake/include/jemalloc/jemalloc_typedefs.h index 
1a58874306e..eeaf7a6760e 100644 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_typedefs.h +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_typedefs.h @@ -5,73 +5,66 @@ typedef struct extent_hooks_s extent_hooks_t; * extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, * size_t alignment, bool *zero, bool *commit, unsigned arena_ind); */ -typedef void *(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, - bool *, unsigned); +typedef void *(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, bool *, unsigned); /* * bool * extent_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, * bool committed, unsigned arena_ind); */ -typedef bool (extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, - unsigned); +typedef bool(extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, unsigned); /* * void * extent_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, * bool committed, unsigned arena_ind); */ -typedef void (extent_destroy_t)(extent_hooks_t *, void *, size_t, bool, - unsigned); +typedef void(extent_destroy_t)(extent_hooks_t *, void *, size_t, bool, unsigned); /* * bool * extent_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, * size_t offset, size_t length, unsigned arena_ind); */ -typedef bool (extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - unsigned); +typedef bool(extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, unsigned); /* * bool * extent_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, * size_t offset, size_t length, unsigned arena_ind); */ -typedef bool (extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, - size_t, unsigned); +typedef bool(extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, unsigned); /* * bool * extent_purge(extent_hooks_t *extent_hooks, void *addr, size_t size, * size_t offset, size_t length, unsigned arena_ind); */ -typedef bool (extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - unsigned); +typedef bool(extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, unsigned); /* * bool * extent_split(extent_hooks_t *extent_hooks, void *addr, size_t size, * size_t size_a, size_t size_b, bool committed, unsigned arena_ind); */ -typedef bool (extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - bool, unsigned); +typedef bool(extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, bool, unsigned); /* * bool * extent_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, * void *addr_b, size_t size_b, bool committed, unsigned arena_ind); */ -typedef bool (extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, - bool, unsigned); +typedef bool(extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, bool, unsigned); -struct extent_hooks_s { - extent_alloc_t *alloc; - extent_dalloc_t *dalloc; - extent_destroy_t *destroy; - extent_commit_t *commit; - extent_decommit_t *decommit; - extent_purge_t *purge_lazy; - extent_purge_t *purge_forced; - extent_split_t *split; - extent_merge_t *merge; +struct extent_hooks_s +{ + extent_alloc_t * alloc; + extent_dalloc_t * dalloc; + extent_destroy_t * destroy; + extent_commit_t * commit; + extent_decommit_t * decommit; + extent_purge_t * purge_lazy; + extent_purge_t * purge_forced; + extent_split_t * split; + extent_merge_t * merge; }; diff --git a/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in 
b/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..8ad95c51560 --- /dev/null +++ b/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,425 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +#define JEMALLOC_PREFIX "je_" +#define JEMALLOC_CPREFIX "JE_" + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 64 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +#define JEMALLOC_OS_UNFAIR_LOCK + +/* Defined if syscall(2) is usable. */ +/* #undef JEMALLOC_USE_SYSCALL */ + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +#define JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC */ + +/* + * Defined if mach_absolute_time() is available. 
+ */ +#define JEMALLOC_HAVE_MACH_ABSOLUTE_TIME + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +/* #undef JEMALLOC_DSS */ + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 14 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. 
This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +/* #undef JEMALLOC_TLS */ + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#define JEMALLOC_ZONE + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. 
+ * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +#define JEMALLOC_HAVE_MALLOC_SIZE + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +/* #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP */ + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +/* #undef JEMALLOC_BACKGROUND_THREAD */ + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +/* #undef JEMALLOC_IS_MALLOC */ + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +#define JEMALLOC_HAVE_VM_MAKE_TAG + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_darwin_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_darwin_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..8671da5db69 --- /dev/null +++ b/contrib/jemalloc-cmake/include_darwin_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,425 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +#define JEMALLOC_PREFIX "je_" +#define JEMALLOC_CPREFIX "JE_" + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. 
+ */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT __asm__ volatile("pause") +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 1 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +#define JEMALLOC_OS_UNFAIR_LOCK + +/* Defined if syscall(2) is usable. */ +/* #undef JEMALLOC_USE_SYSCALL */ + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +#define JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC */ + +/* + * Defined if mach_absolute_time() is available. + */ +#define JEMALLOC_HAVE_MACH_ABSOLUTE_TIME + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. 
+ */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +/* #undef JEMALLOC_DSS */ + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 12 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +/* #undef JEMALLOC_TLS */ + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. 
+ */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#define JEMALLOC_ZONE + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +#define JEMALLOC_HAVE_MALLOC_SIZE + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. 
*/ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +/* #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP */ + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +/* #undef JEMALLOC_BACKGROUND_THREAD */ + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +/* #undef JEMALLOC_IS_MALLOC */ + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +#define JEMALLOC_HAVE_VM_MAKE_TAG + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_freebsd_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_freebsd_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..0f61417d65f --- /dev/null +++ b/contrib/jemalloc-cmake/include_freebsd_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +#define JEMALLOC_OVERRIDE___POSIX_MEMALIGN + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. 
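The CPU_SPINWAIT / HAVE_CPU_SPINWAIT pair defined around this point is consumed inside spin loops. A minimal sketch of the usual consumption pattern, assuming the x86 `pause` expansion from the x86_64 headers above and the empty expansion the aarch64 configs use (a real aarch64 port might substitute `yield` or `isb`, which these generated configs leave out):

```c
/* Hypothetical sketch: spin until a flag is set, yielding to a sibling
 * hyper-thread where the platform provides a hint instruction. */
#include <stdatomic.h>

#if defined(__x86_64__) || defined(__i386__)
#  define CPU_SPINWAIT __asm__ volatile("pause")
#else
#  define CPU_SPINWAIT /* empty, as in the aarch64 headers above */
#endif

static void spin_until_set(atomic_int *flag) {
    while (atomic_load_explicit(flag, memory_order_acquire) == 0)
        CPU_SPINWAIT;
}
```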
*/ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +#define JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Only since 12.1-STABLE */ +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Only since 12.1-STABLE */ +/* Defined if pthread_getname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP */ + +/* Defined if pthread_get_name_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GET_NAME_NP + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +#define JEMALLOC_MALLOC_THREAD_CLEANUP + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +#define JEMALLOC_MUTEX_INIT_CB + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. 
*/ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +#define JEMALLOC_LAZY_LOCK + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 29 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. 
+ * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +#define JEMALLOC_SYSCTL_VM_OVERCOMMIT +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_NOCORE + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). 
+ */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_freebsd_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_freebsd_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..32cad025f5f --- /dev/null +++ b/contrib/jemalloc-cmake/include_freebsd_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +#define JEMALLOC_OVERRIDE___POSIX_MEMALIGN + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT __asm__ volatile("pause") +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 1 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. 
+ */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +#define JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Only since 12.1-STABLE */ +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Only since 12.1-STABLE */ +/* Defined if pthread_getname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP */ + +/* Defined if pthread_get_name_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GET_NAME_NP + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +#define JEMALLOC_MALLOC_THREAD_CLEANUP + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +#define JEMALLOC_MUTEX_INIT_CB + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. 
*/ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +#define JEMALLOC_LAZY_LOCK + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 12 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +#define JEMALLOC_SYSCTL_VM_OVERCOMMIT +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. 
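The LG_PAGE and LG_HUGEPAGE values in the hunk above follow jemalloc's log2 convention: sizes are stored as exponents and recovered by shifting. A self-contained sketch of the arithmetic, using values taken from these headers (LG_PAGE 12 is a 4 KiB page, LG_PAGE 16 in the FreeBSD aarch64 config is 64 KiB, LG_HUGEPAGE 21 is a 2 MiB huge page):

```c
/* Illustration of the LG_* size convention used throughout these headers. */
#include <stdio.h>

int main(void) {
    const unsigned lg_page = 12, lg_hugepage = 21;
    size_t page = (size_t)1 << lg_page;         /* 4096 */
    size_t hugepage = (size_t)1 << lg_hugepage; /* 2097152 */
    /* Page-align an arbitrary size the way slab/extent math does. */
    size_t sz = 5000;
    size_t aligned = (sz + page - 1) & ~(page - 1); /* 8192 */
    printf("%zu %zu %zu\n", page, hugepage, aligned);
    return 0;
}
```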
+ * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_NOCORE + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. 
*/ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/README b/contrib/jemalloc-cmake/include_linux_aarch64/README deleted file mode 100644 index 2ab582803a2..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/README +++ /dev/null @@ -1,7 +0,0 @@ -Here are pre-generated files from jemalloc on Linux aarch64. -You can obtain these files by running ./autogen.sh inside jemalloc source directory. - -Added #define GNU_SOURCE -Added JEMALLOC_OVERRIDE___POSIX_MEMALIGN because why not. -Removed JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF because it's non standard. -Removed JEMALLOC_PURGE_MADVISE_FREE because it's available only from Linux 4.5. diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in similarity index 80% rename from contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h rename to contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in index 5e598348e72..ad535e6d773 100644 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h +++ b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -1,12 +1,6 @@ /* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ #ifndef JEMALLOC_INTERNAL_DEFS_H_ #define JEMALLOC_INTERNAL_DEFS_H_ - - -#ifndef _GNU_SOURCE - #define _GNU_SOURCE -#endif - /* * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all * public APIs to be prefixed. This makes it possible, with some care, to use @@ -19,13 +13,15 @@ * Define overrides for non-standard allocator-related functions if they are * present on the system. */ -#define JEMALLOC_OVERRIDE___LIBC_CALLOC -#define JEMALLOC_OVERRIDE___LIBC_FREE -#define JEMALLOC_OVERRIDE___LIBC_MALLOC -#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN -#define JEMALLOC_OVERRIDE___LIBC_REALLOC -#define JEMALLOC_OVERRIDE___LIBC_VALLOC -#define JEMALLOC_OVERRIDE___POSIX_MEMALIGN +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ /* * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. @@ -51,29 +47,17 @@ #define LG_VADDR 48 /* Defined if C11 atomics are available. */ -#define JEMALLOC_C11_ATOMICS 1 +#define JEMALLOC_C11_ATOMICS /* Defined if GCC __atomic atomics are available. */ -#define JEMALLOC_GCC_ATOMIC_ATOMICS 1 +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS /* Defined if GCC __sync atomics are available. */ -#define JEMALLOC_GCC_SYNC_ATOMICS 1 - -/* - * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and - * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the - * functions are defined in libgcc instead of being inlines). 
- */ -/* #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4 */ - -/* - * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and - * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the - * functions are defined in libgcc instead of being inlines). - */ -/* #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8 */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS /* * Defined if __builtin_clz() and __builtin_clzl() are available. @@ -85,19 +69,13 @@ */ /* #undef JEMALLOC_OS_UNFAIR_LOCK */ -/* - * Defined if OSSpin*() functions are available, as provided by Darwin, and - * documented in the spinlock(3) manual page. - */ -/* #undef JEMALLOC_OSSPIN */ - /* Defined if syscall(2) is usable. */ #define JEMALLOC_USE_SYSCALL /* * Defined if secure_getenv(3) is available. */ -#define JEMALLOC_HAVE_SECURE_GETENV +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ /* * Defined if issetugid(2) is available. @@ -110,21 +88,32 @@ /* Defined if pthread_setname_np(3) is available. */ #define JEMALLOC_HAVE_PTHREAD_SETNAME_NP +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + /* * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. */ -#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1 +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE /* * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. */ -#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1 +#define JEMALLOC_HAVE_CLOCK_MONOTONIC /* * Defined if mach_absolute_time() is available. */ /* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + /* * Defined if _malloc_thread_cleanup() exists. At least in the case of * FreeBSD, pthread_key_create() allocates, which if used during malloc @@ -187,6 +176,9 @@ /* Support utrace(2)-based tracing. */ /* #undef JEMALLOC_UTRACE */ +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + /* Support optional abort() on OOM. */ /* #undef JEMALLOC_XMALLOC */ @@ -202,6 +194,9 @@ /* One page is 2^LG_PAGE bytes. */ #define LG_PAGE 16 +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + /* * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the * system does not explicitly support huge pages; system calls that require @@ -243,6 +238,12 @@ #define JEMALLOC_INTERNAL_FFSL __builtin_ffsl #define JEMALLOC_INTERNAL_FFS __builtin_ffs +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + /* * If defined, explicitly attempt to more uniformly distribute large allocation * pointer alignments across all cache indices. @@ -297,7 +298,7 @@ * MADV_FREE, though typically with higher * system overhead. */ -// #define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_FREE #define JEMALLOC_PURGE_MADVISE_DONTNEED #define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS @@ -309,17 +310,46 @@ */ #define JEMALLOC_MADVISE_DONTDUMP +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. 
*/ +#define JEMALLOC_HAVE_MPROTECT + /* * Defined if transparent huge pages (THPs) are supported via the * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. */ /* #undef JEMALLOC_THP */ +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + /* Define if operating system has alloca.h header. */ -#define JEMALLOC_HAS_ALLOCA_H 1 +#define JEMALLOC_HAS_ALLOCA_H /* C99 restrict keyword supported. */ -#define JEMALLOC_HAS_RESTRICT 1 +#define JEMALLOC_HAS_RESTRICT /* For use by hash code. */ /* #undef JEMALLOC_BIG_ENDIAN */ @@ -360,7 +390,7 @@ /* * If defined, all the features necessary for background threads are present. */ -#define JEMALLOC_BACKGROUND_THREAD 1 +#define JEMALLOC_BACKGROUND_THREAD /* * If defined, jemalloc symbols are not exported (doesn't work when @@ -369,20 +399,29 @@ /* #undef JEMALLOC_EXPORT */ /* config.malloc_conf options string. */ -#define JEMALLOC_CONFIG_MALLOC_CONF "" +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" /* If defined, jemalloc takes the malloc/free/etc. symbol names. */ -#define JEMALLOC_IS_MALLOC 1 +#define JEMALLOC_IS_MALLOC /* * Defined if strerror_r returns char * if _GNU_SOURCE is defined. */ #define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE -/* - * popcount*() functions to use for bitmapping. - */ -#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl -#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_defs.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_defs.h deleted file mode 100644 index d1389237a77..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_defs.h +++ /dev/null @@ -1,43 +0,0 @@ -/* include/jemalloc/jemalloc_defs.h. Generated from jemalloc_defs.h.in by configure. */ -/* Defined if __attribute__((...)) syntax is supported. */ -#define JEMALLOC_HAVE_ATTR - -/* Defined if alloc_size attribute is supported. */ -#define JEMALLOC_HAVE_ATTR_ALLOC_SIZE - -/* Defined if format(printf, ...) attribute is supported. */ -#define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF - -/* - * Define overrides for non-standard allocator-related functions if they are - * present on the system. - */ -#define JEMALLOC_OVERRIDE_MEMALIGN -#define JEMALLOC_OVERRIDE_VALLOC - -/* - * At least Linux omits the "const" in: - * - * size_t malloc_usable_size(const void *ptr); - * - * Match the operating system's prototype. - */ -#define JEMALLOC_USABLE_SIZE_CONST - -/* - * If defined, specify throw() for the public function prototypes when compiling - * with C++. 
The only justification for this is to match the prototypes that - * glibc defines. - */ -#define JEMALLOC_USE_CXX_THROW - -#ifdef _MSC_VER -# ifdef _WIN64 -# define LG_SIZEOF_PTR_WIN 3 -# else -# define LG_SIZEOF_PTR_WIN 2 -# endif -#endif - -/* sizeof(void *) == 2^LG_SIZEOF_PTR. */ -#define LG_SIZEOF_PTR 3 diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_macros.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_macros.h deleted file mode 100644 index 34235894285..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_macros.h +++ /dev/null @@ -1,129 +0,0 @@ -#include -#include -#include -#include -#include - -#define JEMALLOC_VERSION "5.2.1-0-gea6b3e973b477b8061e0076bb257dbd7f3faa756" -#define JEMALLOC_VERSION_MAJOR 5 -#define JEMALLOC_VERSION_MINOR 2 -#define JEMALLOC_VERSION_BUGFIX 1 -#define JEMALLOC_VERSION_NREV 0 -#define JEMALLOC_VERSION_GID "ea6b3e973b477b8061e0076bb257dbd7f3faa756" -#define JEMALLOC_VERSION_GID_IDENT ea6b3e973b477b8061e0076bb257dbd7f3faa756 - -#define MALLOCX_LG_ALIGN(la) ((int)(la)) -#if LG_SIZEOF_PTR == 2 -# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) -#else -# define MALLOCX_ALIGN(a) \ - ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ - ffs((int)(((size_t)(a))>>32))+31)) -#endif -#define MALLOCX_ZERO ((int)0x40) -/* - * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 - * encodes MALLOCX_TCACHE_NONE. - */ -#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) -#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) -/* - * Bias arena index bits so that 0 encodes "use an automatically chosen arena". - */ -#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) - -/* - * Use as arena index in "arena..{purge,decay,dss}" and - * "stats.arenas..*" mallctl interfaces to select all arenas. This - * definition is intentionally specified in raw decimal format to support - * cpp-based string concatenation, e.g. - * - * #define STRINGIFY_HELPER(x) #x - * #define STRINGIFY(x) STRINGIFY_HELPER(x) - * - * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, - * 0); - */ -#define MALLCTL_ARENAS_ALL 4096 -/* - * Use as arena index in "stats.arenas..*" mallctl interfaces to select - * destroyed arenas. 
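The MALLOCX_* and MALLCTL_ARENAS_ALL macros removed above belong to jemalloc's non-standard public API (mallocx, dallocx, mallctl). A hedged usage sketch, assuming an unprefixed build (JEMALLOC_IS_MALLOC) and the conventional `<jemalloc/jemalloc.h>` install path; the purge mallctl string is the exact example given in the deleted comment:

```c
/* Sketch of combining the MALLOCX_* flag macros from the header above. */
#include <jemalloc/jemalloc.h>

#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)

int demo(void) {
    /* 64-byte-aligned, zeroed allocation that bypasses the thread cache. */
    void *p = mallocx(1024,
                      MALLOCX_ALIGN(64) | MALLOCX_ZERO | MALLOCX_TCACHE_NONE);
    if (p == NULL)
        return 1;
    dallocx(p, 0);
    /* Purge unused dirty pages of every arena, per the comment above. */
    return mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge",
                   NULL, NULL, NULL, 0);
}
```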
- */ -#define MALLCTL_ARENAS_DESTROYED 4097 - -#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) -# define JEMALLOC_CXX_THROW throw() -#else -# define JEMALLOC_CXX_THROW -#endif - -#if defined(_MSC_VER) -# define JEMALLOC_ATTR(s) -# define JEMALLOC_ALIGNED(s) __declspec(align(s)) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# ifndef JEMALLOC_EXPORT -# ifdef DLLEXPORT -# define JEMALLOC_EXPORT __declspec(dllexport) -# else -# define JEMALLOC_EXPORT __declspec(dllimport) -# endif -# endif -# define JEMALLOC_FORMAT_ARG(i) -# define JEMALLOC_FORMAT_PRINTF(s, i) -# define JEMALLOC_NOINLINE __declspec(noinline) -# ifdef __cplusplus -# define JEMALLOC_NOTHROW __declspec(nothrow) -# else -# define JEMALLOC_NOTHROW -# endif -# define JEMALLOC_SECTION(s) __declspec(allocate(s)) -# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) -# if _MSC_VER >= 1900 && !defined(__EDG__) -# define JEMALLOC_ALLOCATOR __declspec(allocator) -# else -# define JEMALLOC_ALLOCATOR -# endif -#elif defined(JEMALLOC_HAVE_ATTR) -# define JEMALLOC_ATTR(s) __attribute__((s)) -# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) -# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE -# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) -# else -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# endif -# ifndef JEMALLOC_EXPORT -# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) -# endif -# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG -# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) -# else -# define JEMALLOC_FORMAT_ARG(i) -# endif -# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF -# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) -# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) -# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) -# else -# define JEMALLOC_FORMAT_PRINTF(s, i) -# endif -# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) -# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) -# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) -# define JEMALLOC_RESTRICT_RETURN -# define JEMALLOC_ALLOCATOR -#else -# define JEMALLOC_ATTR(s) -# define JEMALLOC_ALIGNED(s) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# define JEMALLOC_EXPORT -# define JEMALLOC_FORMAT_PRINTF(s, i) -# define JEMALLOC_NOINLINE -# define JEMALLOC_NOTHROW -# define JEMALLOC_SECTION(s) -# define JEMALLOC_RESTRICT_RETURN -# define JEMALLOC_ALLOCATOR -#endif diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_protos.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_protos.h deleted file mode 100644 index ff025e30fa7..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_protos.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * The je_ prefix on the following public symbol declarations is an artifact - * of namespace management, and should be omitted in application code unless - * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). 
- */ -extern JEMALLOC_EXPORT const char *je_malloc_conf; -extern JEMALLOC_EXPORT void (*je_malloc_message)(void *cbopaque, - const char *s); - -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_malloc(size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_calloc(size_t num, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_posix_memalign(void **memptr, - size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_aligned_alloc(size_t alignment, - size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) - JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_realloc(void *ptr, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free(void *ptr) - JEMALLOC_CXX_THROW; - -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_mallocx(size_t size, int flags) - JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_rallocx(void *ptr, size_t size, - int flags) JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, - size_t extra, int flags); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void *ptr, - int flags) JEMALLOC_ATTR(pure); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, - int flags); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) - JEMALLOC_ATTR(pure); - -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, - void *oldp, size_t *oldlenp, void *newp, size_t newlen); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, - size_t *mibp, size_t *miblenp); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, - size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( - void (*write_cb)(void *, const char *), void *je_cbopaque, - const char *opts); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( - JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; - -#ifdef JEMALLOC_OVERRIDE_MEMALIGN -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_memalign(size_t alignment, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); -#endif - -#ifdef JEMALLOC_OVERRIDE_VALLOC -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_valloc(size_t size) JEMALLOC_CXX_THROW - JEMALLOC_ATTR(malloc); -#endif diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_typedefs.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_typedefs.h deleted file mode 100644 index 1a58874306e..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_typedefs.h +++ /dev/null @@ -1,77 +0,0 @@ -typedef struct extent_hooks_s extent_hooks_t; - -/* - * void * - * extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, - * size_t alignment, bool *zero, bool *commit, unsigned arena_ind); - */ -typedef void 
*(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, - bool *, unsigned); - -/* - * bool - * extent_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, - * bool committed, unsigned arena_ind); - */ -typedef bool (extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, - unsigned); - -/* - * void - * extent_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, - * bool committed, unsigned arena_ind); - */ -typedef void (extent_destroy_t)(extent_hooks_t *, void *, size_t, bool, - unsigned); - -/* - * bool - * extent_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, - * size_t offset, size_t length, unsigned arena_ind); - */ -typedef bool (extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - unsigned); - -/* - * bool - * extent_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, - * size_t offset, size_t length, unsigned arena_ind); - */ -typedef bool (extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, - size_t, unsigned); - -/* - * bool - * extent_purge(extent_hooks_t *extent_hooks, void *addr, size_t size, - * size_t offset, size_t length, unsigned arena_ind); - */ -typedef bool (extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - unsigned); - -/* - * bool - * extent_split(extent_hooks_t *extent_hooks, void *addr, size_t size, - * size_t size_a, size_t size_b, bool committed, unsigned arena_ind); - */ -typedef bool (extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - bool, unsigned); - -/* - * bool - * extent_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, - * void *addr_b, size_t size_b, bool committed, unsigned arena_ind); - */ -typedef bool (extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, - bool, unsigned); - -struct extent_hooks_s { - extent_alloc_t *alloc; - extent_dalloc_t *dalloc; - extent_destroy_t *destroy; - extent_commit_t *commit; - extent_decommit_t *decommit; - extent_purge_t *purge_lazy; - extent_purge_t *purge_forced; - extent_split_t *split; - extent_merge_t *merge; -}; diff --git a/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..12890f80ef1 --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. 
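/*
 * A minimal usage sketch of the extent_hooks_t table deleted above: clone the
 * default hooks of an arena, override one slot, and write the table back
 * through mallctl. The arena index 0 and the bare-mmap allocator are
 * illustrative assumptions, not something this change prescribes.
 */
#include <stdbool.h>
#include <stddef.h>
#include <sys/mman.h>
#include <jemalloc/jemalloc.h>

static extent_hooks_t custom_hooks;

static void *custom_alloc(extent_hooks_t *hooks, void *new_addr, size_t size,
                          size_t alignment, bool *zero, bool *commit,
                          unsigned arena_ind) {
    (void)hooks; (void)new_addr; (void)alignment; (void)arena_ind;
    void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) return NULL;
    *zero = true;   /* anonymous mappings are zero-filled */
    *commit = true; /* and already committed */
    return p;
}

static void install_hooks(void) {
    extent_hooks_t *defaults;
    size_t sz = sizeof(defaults);
    /* Read arena 0's current hook table, then override only `alloc`. */
    mallctl("arena.0.extent_hooks", &defaults, &sz, NULL, 0);
    custom_hooks = *defaults;
    custom_hooks.alloc = custom_alloc;
    extent_hooks_t *hooks = &custom_hooks;
    mallctl("arena.0.extent_hooks", NULL, NULL, &hooks, sizeof(hooks));
}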
+ * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 64 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_ATFORK */ + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. 
*/ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. 
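/*
 * The LG_* constants above are log2 exponents, which is why this ppc64le
 * config differs from the x86_64 one: 64 KiB base pages (the common ppc64le
 * kernel default) instead of 4 KiB. Illustrative expansion of how jemalloc
 * derives the byte sizes:
 */
#define PAGE     ((size_t)1 << 16) /* 1 << LG_PAGE     == 65536 (64 KiB)  */
#define HUGEPAGE ((size_t)1 << 21) /* 1 << LG_HUGEPAGE == 2097152 (2 MiB) */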
We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +#define JEMALLOC_DEFINE_MADVISE_FREE + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. 
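/*
 * What the purge settings above select, sketched with raw madvise(2) calls
 * (Linux semantics assumed; jemalloc's real implementation adds fallbacks
 * and error handling):
 */
#include <stddef.h>
#include <sys/mman.h>

/* Lazy purge: pages stay mapped and are reclaimed under memory pressure. */
static void purge_lazy(void *addr, size_t len) {
    (void)madvise(addr, len, MADV_FREE);
}

/* Forced purge: pages are dropped now; the next touch is demand-zeroed. */
static void purge_forced(void *addr, size_t len) {
    (void)madvise(addr, len, MADV_DONTNEED);
}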
*/ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_riscv64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_riscv64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..ad535e6d773 --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_riscv64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. 
*/ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). 
*/ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 29 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. 
+ * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? 
*/ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/README b/contrib/jemalloc-cmake/include_linux_x86_64/README deleted file mode 100644 index bf7663bda8d..00000000000 --- a/contrib/jemalloc-cmake/include_linux_x86_64/README +++ /dev/null @@ -1,7 +0,0 @@ -Here are pre-generated files from jemalloc on Linux x86_64. -You can obtain these files by running ./autogen.sh inside jemalloc source directory. - -Added #define GNU_SOURCE -Added JEMALLOC_OVERRIDE___POSIX_MEMALIGN because why not. -Removed JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF because it's non standard. -Removed JEMALLOC_PURGE_MADVISE_FREE because it's available only from Linux 4.5. diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in similarity index 78% rename from contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h rename to contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in index 7c21fa79397..99ab2d53ca9 100644 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h +++ b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -1,11 +1,6 @@ /* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ #ifndef JEMALLOC_INTERNAL_DEFS_H_ #define JEMALLOC_INTERNAL_DEFS_H_ - -#ifndef _GNU_SOURCE - #define _GNU_SOURCE -#endif - /* * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all * public APIs to be prefixed. This makes it possible, with some care, to use @@ -18,13 +13,15 @@ * Define overrides for non-standard allocator-related functions if they are * present on the system. */ -#define JEMALLOC_OVERRIDE___LIBC_CALLOC -#define JEMALLOC_OVERRIDE___LIBC_FREE -#define JEMALLOC_OVERRIDE___LIBC_MALLOC -#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN -#define JEMALLOC_OVERRIDE___LIBC_REALLOC -#define JEMALLOC_OVERRIDE___LIBC_VALLOC -#define JEMALLOC_OVERRIDE___POSIX_MEMALIGN +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ /* * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. @@ -50,29 +47,17 @@ #define LG_VADDR 48 /* Defined if C11 atomics are available. */ -#define JEMALLOC_C11_ATOMICS 1 +#define JEMALLOC_C11_ATOMICS /* Defined if GCC __atomic atomics are available. */ -#define JEMALLOC_GCC_ATOMIC_ATOMICS 1 +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS /* Defined if GCC __sync atomics are available. 
*/ -#define JEMALLOC_GCC_SYNC_ATOMICS 1 - -/* - * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and - * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the - * functions are defined in libgcc instead of being inlines). - */ -/* #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4 */ - -/* - * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and - * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the - * functions are defined in libgcc instead of being inlines). - */ -/* #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8 */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS /* * Defined if __builtin_clz() and __builtin_clzl() are available. @@ -84,20 +69,13 @@ */ /* #undef JEMALLOC_OS_UNFAIR_LOCK */ -/* - * Defined if OSSpin*() functions are available, as provided by Darwin, and - * documented in the spinlock(3) manual page. - */ -/* #undef JEMALLOC_OSSPIN */ - /* Defined if syscall(2) is usable. */ #define JEMALLOC_USE_SYSCALL /* * Defined if secure_getenv(3) is available. */ -// Don't want dependency on newer GLIBC -//#define JEMALLOC_HAVE_SECURE_GETENV +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ /* * Defined if issetugid(2) is available. @@ -110,21 +88,32 @@ /* Defined if pthread_setname_np(3) is available. */ #define JEMALLOC_HAVE_PTHREAD_SETNAME_NP +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + /* * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. */ -#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1 +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE /* * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. */ -#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1 +#define JEMALLOC_HAVE_CLOCK_MONOTONIC /* * Defined if mach_absolute_time() is available. */ /* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + /* * Defined if _malloc_thread_cleanup() exists. At least in the case of * FreeBSD, pthread_key_create() allocates, which if used during malloc @@ -160,6 +149,9 @@ /* JEMALLOC_STATS enables statistics calculation. */ #define JEMALLOC_STATS +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + /* JEMALLOC_PROF enables allocation profiling. */ /* #undef JEMALLOC_PROF */ @@ -184,6 +176,9 @@ /* Support utrace(2)-based tracing. */ /* #undef JEMALLOC_UTRACE */ +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + /* Support optional abort() on OOM. */ /* #undef JEMALLOC_XMALLOC */ @@ -199,6 +194,9 @@ /* One page is 2^LG_PAGE bytes. */ #define LG_PAGE 12 +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + /* * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the * system does not explicitly support huge pages; system calls that require @@ -240,6 +238,12 @@ #define JEMALLOC_INTERNAL_FFSL __builtin_ffsl #define JEMALLOC_INTERNAL_FFS __builtin_ffs +/* + * popcount*() functions to use for bitmapping. 
+ */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + /* * If defined, explicitly attempt to more uniformly distribute large allocation * pointer alignments across all cache indices. @@ -252,6 +256,12 @@ */ /* #undef JEMALLOC_LOG */ +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + /* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ @@ -288,7 +298,7 @@ * MADV_FREE, though typically with higher * system overhead. */ -//#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_FREE #define JEMALLOC_PURGE_MADVISE_DONTNEED #define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS @@ -300,17 +310,46 @@ */ #define JEMALLOC_MADVISE_DONTDUMP +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + /* * Defined if transparent huge pages (THPs) are supported via the * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. */ /* #undef JEMALLOC_THP */ +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + /* Define if operating system has alloca.h header. */ -#define JEMALLOC_HAS_ALLOCA_H 1 +#define JEMALLOC_HAS_ALLOCA_H /* C99 restrict keyword supported. */ -#define JEMALLOC_HAS_RESTRICT 1 +#define JEMALLOC_HAS_RESTRICT /* For use by hash code. */ /* #undef JEMALLOC_BIG_ENDIAN */ @@ -351,7 +390,7 @@ /* * If defined, all the features necessary for background threads are present. */ -#define JEMALLOC_BACKGROUND_THREAD 1 +#define JEMALLOC_BACKGROUND_THREAD /* * If defined, jemalloc symbols are not exported (doesn't work when @@ -360,20 +399,29 @@ /* #undef JEMALLOC_EXPORT */ /* config.malloc_conf options string. */ -#define JEMALLOC_CONFIG_MALLOC_CONF "" +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" /* If defined, jemalloc takes the malloc/free/etc. symbol names. */ -#define JEMALLOC_IS_MALLOC 1 +#define JEMALLOC_IS_MALLOC /* * Defined if strerror_r returns char * if _GNU_SOURCE is defined. */ #define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE -/* - * popcount*() functions to use for bitmapping. - */ -#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl -#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. 
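/*
 * The rename from jemalloc_internal_defs.h to .h.in in this hunk is what makes
 * the "@JEMALLOC_CONFIG_MALLOC_CONF@" token work: it is a configure-time
 * placeholder the build substitutes, replacing the previously hard-coded "".
 * A hypothetical substitution result and how jemalloc consumes it:
 */
#define JEMALLOC_CONFIG_MALLOC_CONF "percpu_arena:percpu,muzzy_decay_ms:5000"
/* Baked in as the lowest-priority option source (see jemalloc_preamble.h): */
static const char *const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF;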
*/ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_preamble.h b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_preamble.h deleted file mode 100644 index d79551e1f25..00000000000 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_preamble.h +++ /dev/null @@ -1,216 +0,0 @@ -#ifndef JEMALLOC_PREAMBLE_H -#define JEMALLOC_PREAMBLE_H - -#include "jemalloc_internal_defs.h" -#include "jemalloc/internal/jemalloc_internal_decls.h" - -#ifdef JEMALLOC_UTRACE -#include <sys/ktrace.h> -#endif - -#define JEMALLOC_NO_DEMANGLE -#ifdef JEMALLOC_JET -# undef JEMALLOC_IS_MALLOC -# define JEMALLOC_N(n) jet_##n -# include "jemalloc/internal/public_namespace.h" -# define JEMALLOC_NO_RENAME -# include "jemalloc/jemalloc.h" -# undef JEMALLOC_NO_RENAME -#else -# define JEMALLOC_N(n) je_##n -# include "jemalloc/jemalloc.h" -#endif - -#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN)) -#include <libkern/OSAtomic.h> -#endif - -#ifdef JEMALLOC_ZONE -#include <mach/mach_error.h> -#include <mach/mach_init.h> -#include <mach/vm_map.h> -#endif - -#include "jemalloc/internal/jemalloc_internal_macros.h" - -/* - * Note that the ordering matters here; the hook itself is name-mangled. We - * want the inclusion of hooks to happen early, so that we hook as much as - * possible. - */ -#ifndef JEMALLOC_NO_PRIVATE_NAMESPACE -# ifndef JEMALLOC_JET -# include "jemalloc/internal/private_namespace.h" -# else -# include "jemalloc/internal/private_namespace_jet.h" -# endif -#endif -#include "jemalloc/internal/test_hooks.h" - -#ifdef JEMALLOC_DEFINE_MADVISE_FREE -# define JEMALLOC_MADV_FREE 8 -#endif - -static const bool config_debug = -#ifdef JEMALLOC_DEBUG - true -#else - false -#endif - ; -static const bool have_dss = -#ifdef JEMALLOC_DSS - true -#else - false -#endif - ; -static const bool have_madvise_huge = -#ifdef JEMALLOC_HAVE_MADVISE_HUGE - true -#else - false -#endif - ; -static const bool config_fill = -#ifdef JEMALLOC_FILL - true -#else - false -#endif - ; -static const bool config_lazy_lock = -#ifdef JEMALLOC_LAZY_LOCK - true -#else - false -#endif - ; -static const char * const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF; -static const bool config_prof = -#ifdef JEMALLOC_PROF - true -#else - false -#endif - ; -static const bool config_prof_libgcc = -#ifdef JEMALLOC_PROF_LIBGCC - true -#else - false -#endif - ; -static const bool config_prof_libunwind = -#ifdef JEMALLOC_PROF_LIBUNWIND - true -#else - false -#endif - ; -static const bool maps_coalesce = -#ifdef JEMALLOC_MAPS_COALESCE - true -#else - false -#endif - ; -static const bool config_stats = -#ifdef JEMALLOC_STATS - true -#else - false -#endif - ; -static const bool config_tls = -#ifdef JEMALLOC_TLS - true -#else - false -#endif - ; -static const bool config_utrace = -#ifdef JEMALLOC_UTRACE - true -#else - false -#endif - ; -static const bool config_xmalloc = -#ifdef JEMALLOC_XMALLOC - true -#else - false -#endif - ; -static const bool config_cache_oblivious = -#ifdef JEMALLOC_CACHE_OBLIVIOUS - true -#else - false -#endif - ; -/* - * Undocumented, for jemalloc development use only at the moment. See the note - * in jemalloc/internal/log.h. - */ -static const bool config_log = -#ifdef JEMALLOC_LOG - true -#else - false -#endif - ; -#ifdef JEMALLOC_HAVE_SCHED_GETCPU -/* Currently percpu_arena depends on sched_getcpu.
*/ -#define JEMALLOC_PERCPU_ARENA -#endif -static const bool have_percpu_arena = -#ifdef JEMALLOC_PERCPU_ARENA - true -#else - false -#endif - ; -/* - * Undocumented, and not recommended; the application should take full - * responsibility for tracking provenance. - */ -static const bool force_ivsalloc = -#ifdef JEMALLOC_FORCE_IVSALLOC - true -#else - false -#endif - ; -static const bool have_background_thread = -#ifdef JEMALLOC_BACKGROUND_THREAD - true -#else - false -#endif - ; - -#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1 -#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1 - -/* - * Are extra safety checks enabled; things like checking the size of sized - * deallocations, double-frees, etc. - */ -static const bool config_opt_safety_checks = -#ifdef JEMALLOC_OPT_SAFETY_CHECKS - true -#elif defined(JEMALLOC_DEBUG) - /* - * This lets us only guard safety checks by one flag instead of two; fast - * checks can guard solely by config_opt_safety_checks and run in debug mode - * too. - */ - true -#else - false -#endif - ; - -#endif /* JEMALLOC_PREAMBLE_H */ diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_macros.h b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_macros.h deleted file mode 100644 index 34235894285..00000000000 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_macros.h +++ /dev/null @@ -1,129 +0,0 @@ -#include <stdlib.h> -#include <stdbool.h> -#include <stdint.h> -#include <limits.h> -#include <strings.h> - -#define JEMALLOC_VERSION "5.2.1-0-gea6b3e973b477b8061e0076bb257dbd7f3faa756" -#define JEMALLOC_VERSION_MAJOR 5 -#define JEMALLOC_VERSION_MINOR 2 -#define JEMALLOC_VERSION_BUGFIX 1 -#define JEMALLOC_VERSION_NREV 0 -#define JEMALLOC_VERSION_GID "ea6b3e973b477b8061e0076bb257dbd7f3faa756" -#define JEMALLOC_VERSION_GID_IDENT ea6b3e973b477b8061e0076bb257dbd7f3faa756 - -#define MALLOCX_LG_ALIGN(la) ((int)(la)) -#if LG_SIZEOF_PTR == 2 -# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) -#else -# define MALLOCX_ALIGN(a) \ - ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ - ffs((int)(((size_t)(a))>>32))+31)) -#endif -#define MALLOCX_ZERO ((int)0x40) -/* - * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 - * encodes MALLOCX_TCACHE_NONE. - */ -#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) -#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) -/* - * Bias arena index bits so that 0 encodes "use an automatically chosen arena". - */ -#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) - -/* - * Use as arena index in "arena.<i>.{purge,decay,dss}" and - * "stats.arenas.<i>.*" mallctl interfaces to select all arenas. This - * definition is intentionally specified in raw decimal format to support - * cpp-based string concatenation, e.g. - * - * #define STRINGIFY_HELPER(x) #x - * #define STRINGIFY(x) STRINGIFY_HELPER(x) - * - * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, - * 0); - */ -#define MALLCTL_ARENAS_ALL 4096 -/* - * Use as arena index in "stats.arenas.<i>.*" mallctl interfaces to select - * destroyed arenas.
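/*
 * Usage sketch for the flag macros in the header deleted above (the same
 * macros remain available from jemalloc's public jemalloc.h): size-class
 * flags pack into one int, and MALLCTL_ARENAS_ALL stays in raw decimal
 * precisely so it can be stringified as the source comment shows.
 */
#include <jemalloc/jemalloc.h>

#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)

static void mallocx_example(void) {
    /* 64-byte-aligned, zero-initialized allocation: flags pack into 0x46. */
    void *p = mallocx(4096, MALLOCX_ALIGN(64) | MALLOCX_ZERO);
    if (p != NULL) {
        /* Sized deallocation must pass the same alignment flag. */
        sdallocx(p, 4096, MALLOCX_ALIGN(64));
    }
    /* Purge unused dirty pages of all arenas at once. */
    mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge",
            NULL, NULL, NULL, 0);
}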
- */ -#define MALLCTL_ARENAS_DESTROYED 4097 - -#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) -# define JEMALLOC_CXX_THROW throw() -#else -# define JEMALLOC_CXX_THROW -#endif - -#if defined(_MSC_VER) -# define JEMALLOC_ATTR(s) -# define JEMALLOC_ALIGNED(s) __declspec(align(s)) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# ifndef JEMALLOC_EXPORT -# ifdef DLLEXPORT -# define JEMALLOC_EXPORT __declspec(dllexport) -# else -# define JEMALLOC_EXPORT __declspec(dllimport) -# endif -# endif -# define JEMALLOC_FORMAT_ARG(i) -# define JEMALLOC_FORMAT_PRINTF(s, i) -# define JEMALLOC_NOINLINE __declspec(noinline) -# ifdef __cplusplus -# define JEMALLOC_NOTHROW __declspec(nothrow) -# else -# define JEMALLOC_NOTHROW -# endif -# define JEMALLOC_SECTION(s) __declspec(allocate(s)) -# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) -# if _MSC_VER >= 1900 && !defined(__EDG__) -# define JEMALLOC_ALLOCATOR __declspec(allocator) -# else -# define JEMALLOC_ALLOCATOR -# endif -#elif defined(JEMALLOC_HAVE_ATTR) -# define JEMALLOC_ATTR(s) __attribute__((s)) -# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) -# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE -# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) -# else -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# endif -# ifndef JEMALLOC_EXPORT -# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) -# endif -# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG -# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) -# else -# define JEMALLOC_FORMAT_ARG(i) -# endif -# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF -# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) -# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) -# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) -# else -# define JEMALLOC_FORMAT_PRINTF(s, i) -# endif -# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) -# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) -# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) -# define JEMALLOC_RESTRICT_RETURN -# define JEMALLOC_ALLOCATOR -#else -# define JEMALLOC_ATTR(s) -# define JEMALLOC_ALIGNED(s) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# define JEMALLOC_EXPORT -# define JEMALLOC_FORMAT_PRINTF(s, i) -# define JEMALLOC_NOINLINE -# define JEMALLOC_NOTHROW -# define JEMALLOC_SECTION(s) -# define JEMALLOC_RESTRICT_RETURN -# define JEMALLOC_ALLOCATOR -#endif diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_protos.h b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_protos.h deleted file mode 100644 index ff025e30fa7..00000000000 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_protos.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * The je_ prefix on the following public symbol declarations is an artifact - * of namespace management, and should be omitted in application code unless - * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). 
- */ -extern JEMALLOC_EXPORT const char *je_malloc_conf; -extern JEMALLOC_EXPORT void (*je_malloc_message)(void *cbopaque, - const char *s); - -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_malloc(size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_calloc(size_t num, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_posix_memalign(void **memptr, - size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_aligned_alloc(size_t alignment, - size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) - JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_realloc(void *ptr, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free(void *ptr) - JEMALLOC_CXX_THROW; - -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_mallocx(size_t size, int flags) - JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_rallocx(void *ptr, size_t size, - int flags) JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, - size_t extra, int flags); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void *ptr, - int flags) JEMALLOC_ATTR(pure); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, - int flags); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) - JEMALLOC_ATTR(pure); - -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, - void *oldp, size_t *oldlenp, void *newp, size_t newlen); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, - size_t *mibp, size_t *miblenp); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, - size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( - void (*write_cb)(void *, const char *), void *je_cbopaque, - const char *opts); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( - JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; - -#ifdef JEMALLOC_OVERRIDE_MEMALIGN -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_memalign(size_t alignment, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); -#endif - -#ifdef JEMALLOC_OVERRIDE_VALLOC -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_valloc(size_t size) JEMALLOC_CXX_THROW - JEMALLOC_ATTR(malloc); -#endif diff --git a/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..684d4debb14 --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,428 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. 
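/*
 * The mallctl()/mallctlbymib() prototypes above are jemalloc's control-plane
 * API. A small sketch of the common stats-reading pattern (meaningful here
 * because the configs in this change all define JEMALLOC_STATS): bump "epoch"
 * to refresh the cached counters, then read them.
 */
#include <stdint.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

static void print_allocated(void) {
    uint64_t epoch = 1;
    size_t sz = sizeof(epoch);
    mallctl("epoch", &epoch, &sz, &epoch, sz); /* refresh cached stats */

    size_t allocated;
    sz = sizeof(allocated);
    if (mallctl("stats.allocated", &allocated, &sz, NULL, 0) == 0)
        printf("jemalloc stats.allocated = %zu bytes\n", allocated);
}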
*/ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT __asm__ volatile("pause") +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 1 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/// musl doesn't support it +/* Defined if pthread_getname_np(3) is available. */ +/* #define JEMALLOC_HAVE_PTHREAD_GETNAME_NP */ + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. 
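/*
 * Unlike the ppc64le/riscv64 configs above (empty CPU_SPINWAIT,
 * HAVE_CPU_SPINWAIT 0), this x86_64 config gets a real spin hint. Sketch of
 * how such a macro is consumed inside a polling loop:
 */
static void spin_until_set(volatile int *flag) {
    while (!*flag) {
        __asm__ volatile("pause"); /* CPU_SPINWAIT: yield to the SMT sibling */
    }
}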
At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 12 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. 
*/ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. 
+ * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. 
*/ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/kvproto b/contrib/kvproto index 12e2f5a9d16..a5d4ffd2ba3 160000 --- a/contrib/kvproto +++ b/contrib/kvproto @@ -1 +1 @@ -Subproject commit 12e2f5a9d167f46602804840857ddc8ff06dc695 +Subproject commit a5d4ffd2ba337dad0bc99e9fb53bf665864a3f3b diff --git a/contrib/prometheus-cpp b/contrib/prometheus-cpp index ca1f3463e74..76470b3ec02 160000 --- a/contrib/prometheus-cpp +++ b/contrib/prometheus-cpp @@ -1 +1 @@ -Subproject commit ca1f3463e74d957d1cccddd4a1a29e3e5d34bd83 +Subproject commit 76470b3ec024c8214e1f4253fb1f4c0b28d3df94 diff --git a/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt b/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt index daebd1b7c5a..993618e16ac 100644 --- a/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt +++ b/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt @@ -12,9 +12,18 @@ if(ENABLE_COMPRESSION) endif() add_library(pull + ${PROMETHEUS_SRC_DIR}/pull/src/basic_auth.cc + ${PROMETHEUS_SRC_DIR}/pull/src/basic_auth.h + ${PROMETHEUS_SRC_DIR}/pull/src/endpoint.cc + ${PROMETHEUS_SRC_DIR}/pull/src/endpoint.h ${PROMETHEUS_SRC_DIR}/pull/src/exposer.cc ${PROMETHEUS_SRC_DIR}/pull/src/handler.cc ${PROMETHEUS_SRC_DIR}/pull/src/handler.h + ${PROMETHEUS_SRC_DIR}/pull/src/metrics_collector.cc + ${PROMETHEUS_SRC_DIR}/pull/src/metrics_collector.h + + ${PROMETHEUS_SRC_DIR}/pull/src/detail/base64.h + $<$:$> ) diff --git a/contrib/prometheus-cpp-cmake/push/CMakeLists.txt b/contrib/prometheus-cpp-cmake/push/CMakeLists.txt index 71dad9fb812..b776d17bdaf 100644 --- a/contrib/prometheus-cpp-cmake/push/CMakeLists.txt +++ b/contrib/prometheus-cpp-cmake/push/CMakeLists.txt @@ -3,6 +3,8 @@ if(NOT CURL_FOUND) endif() add_library(push + ${PROMETHEUS_SRC_DIR}/push/src/curl_wrapper.cc + ${PROMETHEUS_SRC_DIR}/push/src/curl_wrapper.h ${PROMETHEUS_SRC_DIR}/push/src/gateway.cc ) diff --git a/contrib/tiflash-proxy b/contrib/tiflash-proxy index ca2f51f94e5..42ede65b66a 160000 --- a/contrib/tiflash-proxy +++ b/contrib/tiflash-proxy @@ -1 +1 @@ -Subproject commit ca2f51f94e55bdd23749dcc02ab4afb94eeb5ae5 +Subproject commit 42ede65b66aed69debc80b60a31c63e41010d450 diff --git a/contrib/tiflash-proxy-cmake/CMakeLists.txt b/contrib/tiflash-proxy-cmake/CMakeLists.txt index e243ecba37c..e3e2df379a1 100644 --- a/contrib/tiflash-proxy-cmake/CMakeLists.txt +++ b/contrib/tiflash-proxy-cmake/CMakeLists.txt @@ -4,7 +4,11 @@ file(GLOB_RECURSE _TIFLASH_PROXY_SRCS "${_TIFLASH_PROXY_SOURCE_DIR}/*.rs") list(FILTER _TIFLASH_PROXY_SRCS EXCLUDE REGEX ${_TIFLASH_PROXY_SOURCE_DIR}/target/.*) # use `CFLAGS=-w CXXFLAGS=-w` to inhibit warning messages. 
-set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} CFLAGS=-w CXXFLAGS=-w) +if (TIFLASH_LLVM_TOOLCHAIN) + set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} "CFLAGS=-w -fuse-ld=lld" "CXXFLAGS=-w -fuse-ld=lld -stdlib=libc++") +else() + set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} CFLAGS=-w CXXFLAGS=-w) +endif() if(TIFLASH_LLVM_TOOLCHAIN AND USE_LIBCXX) set(TIFLASH_RUST_LINKER ${CMAKE_CURRENT_BINARY_DIR}/tiflash-linker) diff --git a/contrib/tipb b/contrib/tipb index bfb5c2c5518..0f4f873beca 160000 --- a/contrib/tipb +++ b/contrib/tipb @@ -1 +1 @@ -Subproject commit bfb5c2c55188c254018d3cf77bfad73b4d4b77ec +Subproject commit 0f4f873beca8d5078dde0a23d15ad5ce3188ed0d diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index cce11bd6997..0df79f89a84 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -91,12 +91,10 @@ add_headers_and_sources(dbms src/Storages/Page/V2/VersionSet) add_headers_and_sources(dbms src/Storages/Page/V2/gc) add_headers_and_sources(dbms src/WindowFunctions) add_headers_and_sources(dbms src/TiDB/Schema) -if (ENABLE_V3_PAGESTORAGE) - add_headers_and_sources(dbms src/Storages/Page/V3) - add_headers_and_sources(dbms src/Storages/Page/V3/LogFile) - add_headers_and_sources(dbms src/Storages/Page/V3/WAL) - add_headers_and_sources(dbms src/Storages/Page/V3/spacemap) -endif() +add_headers_and_sources(dbms src/Storages/Page/V3) +add_headers_and_sources(dbms src/Storages/Page/V3/LogFile) +add_headers_and_sources(dbms src/Storages/Page/V3/WAL) +add_headers_and_sources(dbms src/Storages/Page/V3/spacemap) add_headers_and_sources(dbms src/Storages/Page/) add_headers_and_sources(dbms src/TiDB) add_headers_and_sources(dbms src/Client) @@ -318,11 +316,15 @@ if (ENABLE_TESTS) ${TiFlash_SOURCE_DIR}/dbms/src/AggregateFunctions/AggregateFunctionSum.cpp ) target_include_directories(bench_dbms BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR} ${benchmark_SOURCE_DIR}/include) + target_compile_definitions(bench_dbms PUBLIC DBMS_PUBLIC_GTEST) target_link_libraries(bench_dbms gtest dbms test_util_bench_main benchmark clickhouse_functions) if (ENABLE_TIFLASH_DTWORKLOAD) target_link_libraries(bench_dbms dt-workload-lib) endif () + if (ENABLE_TIFLASH_PAGEWORKLOAD) + target_link_libraries(bench_dbms page-workload-lib) + endif () add_check(bench_dbms) endif () diff --git a/dbms/src/Client/Connection.cpp b/dbms/src/Client/Connection.cpp index 61a2843ac59..e21bde19a47 100644 --- a/dbms/src/Client/Connection.cpp +++ b/dbms/src/Client/Connection.cpp @@ -38,12 +38,6 @@ #include #endif - -namespace CurrentMetrics -{ -extern const Metric SendExternalTables; -} - namespace DB { namespace ErrorCodes @@ -434,8 +428,6 @@ void Connection::sendExternalTablesData(ExternalTablesData & data) size_t maybe_compressed_out_bytes = maybe_compressed_out ? 
maybe_compressed_out->count() : 0;
 size_t rows = 0;

- CurrentMetrics::Increment metric_increment{CurrentMetrics::SendExternalTables};
-
 for (auto & elem : data)
 {
 elem.first->readPrefix();
diff --git a/dbms/src/Client/ConnectionPoolWithFailover.cpp b/dbms/src/Client/ConnectionPoolWithFailover.cpp
index a9b6825a3fe..179b2d92c0e 100644
--- a/dbms/src/Client/ConnectionPoolWithFailover.cpp
+++ b/dbms/src/Client/ConnectionPoolWithFailover.cpp
@@ -20,13 +20,6 @@
#include
#include
-
-namespace ProfileEvents
-{
-extern const Event DistributedConnectionMissingTable;
-extern const Event DistributedConnectionStaleReplica;
-} // namespace ProfileEvents
-
namespace DB
{
namespace ErrorCodes
@@ -50,7 +43,7 @@ ConnectionPoolWithFailover::ConnectionPoolWithFailover(
 hostname_differences.resize(nested_pools.size());
 for (size_t i = 0; i < nested_pools.size(); ++i)
 {
- ConnectionPool & connection_pool = dynamic_cast<ConnectionPool &>(*nested_pools[i]);
+ auto & connection_pool = dynamic_cast<ConnectionPool &>(*nested_pools[i]);
 hostname_differences[i] = getHostNameDifference(local_hostname, connection_pool.getHost());
 }
 }
@@ -187,7 +180,6 @@ ConnectionPoolWithFailover::tryGetEntry(
 fail_message = "There is no table " + table_to_check->database + "." + table_to_check->table + " on server: " + result.entry->getDescription();
 LOG_WARNING(log, fail_message);
- ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable);
 return result;
 }
@@ -217,7 +209,6 @@ ConnectionPoolWithFailover::tryGetEntry(
 table_to_check->database,
 table_to_check->table,
 delay);
- ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica);
 }
 }
 catch (const Exception & e)
diff --git a/dbms/src/Columns/ColumnConst.h b/dbms/src/Columns/ColumnConst.h
index 27283c0f24a..da071507a72 100644
--- a/dbms/src/Columns/ColumnConst.h
+++ b/dbms/src/Columns/ColumnConst.h
@@ -233,7 +233,8 @@ class ColumnConst final : public COWPtrHelper<IColumn, ColumnConst>
 template <typename T>
 T getValue() const
 {
- return getField().safeGet<typename NearestFieldType<T>::Type>();
+ auto && tmp = getField();
+ return std::move(tmp.safeGet<typename NearestFieldType<T>::Type>());
 }
};
diff --git a/dbms/src/Common/Arena.h b/dbms/src/Common/Arena.h
index c61ebfca8aa..ebaaf607a6d 100644
--- a/dbms/src/Common/Arena.h
+++ b/dbms/src/Common/Arena.h
@@ -24,13 +24,6 @@
#include
#include
-
-namespace ProfileEvents
-{
-extern const Event ArenaAllocChunks;
-extern const Event ArenaAllocBytes;
-} // namespace ProfileEvents
-
namespace DB
{
/** Memory pool to append something. For example, short strings.
@@ -55,9 +48,6 @@ class Arena : private boost::noncopyable
 Chunk(size_t size_, Chunk * prev_)
 {
- ProfileEvents::increment(ProfileEvents::ArenaAllocChunks);
- ProfileEvents::increment(ProfileEvents::ArenaAllocBytes, size_);
-
 begin = reinterpret_cast<char *>(Allocator::alloc(size_));
 pos = begin;
 end = begin + size_;
diff --git a/dbms/src/Common/CurrentMetrics.cpp b/dbms/src/Common/CurrentMetrics.cpp
index 8a2f111d882..8673784c590 100644
--- a/dbms/src/Common/CurrentMetrics.cpp
+++ b/dbms/src/Common/CurrentMetrics.cpp
@@ -17,36 +17,13 @@
/// Available metrics. Add something here as you wish.
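(For context: the metric and event lists in these files use the X-macro idiom, where each `M(NAME)` entry expands once into an enum value and once into its name string. Below is a minimal, self-contained sketch of the pattern with hypothetical metric names — it is not the project's code; the file's real list follows.)

```cpp
// Sketch of the X-macro pattern behind CurrentMetrics/ProfileEvents.
// Names here are hypothetical; only the expansion technique matters.
#define APPLY_FOR_DEMO_METRICS(M) \
    M(OpenFileForRead)            \
    M(MemoryTracking)

// First expansion: one enum value per entry.
enum DemoMetric
{
#define M(NAME) NAME,
    APPLY_FOR_DEMO_METRICS(M)
#undef M
    DemoMetricCount
};

// Second expansion: a parallel table of the entries' names.
static const char * demo_metric_names[] = {
#define M(NAME) #NAME,
    APPLY_FOR_DEMO_METRICS(M)
#undef M
};
```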
#define APPLY_FOR_METRICS(M) \ - M(Query) \ - M(Merge) \ - M(ReplicatedFetch) \ - M(ReplicatedSend) \ - M(ReplicatedChecks) \ - M(BackgroundPoolTask) \ - M(DiskSpaceReservedForMerge) \ - M(DistributedSend) \ - M(QueryPreempted) \ - M(TCPConnection) \ - M(HTTPConnection) \ - M(InterserverConnection) \ M(OpenFileForRead) \ M(OpenFileForWrite) \ M(OpenFileForReadWrite) \ - M(SendExternalTables) \ - M(QueryThread) \ - M(ReadonlyReplica) \ - M(LeaderReplica) \ M(MemoryTracking) \ M(MemoryTrackingInBackgroundProcessingPool) \ - M(MemoryTrackingForMerges) \ - M(LeaderElection) \ - M(EphemeralNode) \ - M(DelayedInserts) \ - M(ContextLockWait) \ - M(StorageBufferRows) \ - M(StorageBufferBytes) \ - M(DictCacheRequests) \ - M(Revision) \ + M(LogicalCPUCores) \ + M(MemoryCapacity) \ M(PSMVCCNumSnapshots) \ M(PSMVCCSnapshotsList) \ M(RWLockWaitingReaders) \ diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp index c6c3caa44ad..ad5010d7826 100644 --- a/dbms/src/Common/FailPoint.cpp +++ b/dbms/src/Common/FailPoint.cpp @@ -12,7 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include +#include +#include +#include +#include +#include #include #include @@ -21,7 +27,6 @@ namespace DB { std::unordered_map> FailPointHelper::fail_point_wait_channels; - #define APPLY_FOR_FAILPOINTS_ONCE(M) \ M(exception_between_drop_meta_and_data) \ M(exception_between_alter_data_and_meta) \ @@ -85,33 +90,54 @@ std::unordered_map> FailPointHelper::f M(force_remote_read_for_batch_cop) \ M(force_context_path) \ M(force_slow_page_storage_snapshot_release) \ - M(force_change_all_blobs_to_read_only) - -#define APPLY_FOR_FAILPOINTS_ONCE_WITH_CHANNEL(M) \ - M(pause_with_alter_locks_acquired) \ - M(hang_in_execution) \ - M(pause_before_dt_background_delta_merge) \ - M(pause_until_dt_background_delta_merge) \ - M(pause_before_apply_raft_cmd) \ - M(pause_before_apply_raft_snapshot) \ - M(pause_until_apply_raft_snapshot) \ + M(force_change_all_blobs_to_read_only) \ + M(unblock_query_init_after_write) + + +#define APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M) \ + M(pause_with_alter_locks_acquired) \ + M(hang_in_execution) \ + M(pause_before_dt_background_delta_merge) \ + M(pause_until_dt_background_delta_merge) \ + M(pause_before_apply_raft_cmd) \ + M(pause_before_apply_raft_snapshot) \ + M(pause_until_apply_raft_snapshot) \ M(pause_after_copr_streams_acquired_once) -#define APPLY_FOR_FAILPOINTS_WITH_CHANNEL(M) \ - M(pause_when_reading_from_dt_stream) \ - M(pause_when_writing_to_dt_store) \ - M(pause_when_ingesting_to_dt_store) \ - M(pause_when_altering_dt_store) \ - M(pause_after_copr_streams_acquired) \ - M(pause_before_server_merge_one_delta) +#define APPLY_FOR_PAUSEABLE_FAILPOINTS(M) \ + M(pause_when_reading_from_dt_stream) \ + M(pause_when_writing_to_dt_store) \ + M(pause_when_ingesting_to_dt_store) \ + M(pause_when_altering_dt_store) \ + M(pause_after_copr_streams_acquired) \ + M(pause_before_server_merge_one_delta) \ + M(pause_query_init) + + +#define APPLY_FOR_RANDOM_FAILPOINTS(M) \ + M(random_tunnel_wait_timeout_failpoint) \ + M(random_tunnel_init_rpc_failure_failpoint) \ + M(random_receiver_sync_msg_push_failure_failpoint) \ + M(random_receiver_async_msg_push_failure_failpoint) \ + M(random_limit_check_failpoint) \ + M(random_join_build_failpoint) \ + M(random_join_prob_failpoint) \ + M(random_aggregate_create_state_failpoint) \ + M(random_aggregate_merge_failpoint) \ + M(random_sharedquery_failpoint) \ + M(random_interpreter_failpoint) \ 
+ M(random_task_lifecycle_failpoint) \
+ M(random_task_manager_find_task_failure_failpoint) \
+ M(random_min_tso_scheduler_failpoint)
 namespace FailPoints
 {
#define M(NAME) extern const char(NAME)[] = #NAME "";
APPLY_FOR_FAILPOINTS_ONCE(M)
APPLY_FOR_FAILPOINTS(M)
-APPLY_FOR_FAILPOINTS_ONCE_WITH_CHANNEL(M)
-APPLY_FOR_FAILPOINTS_WITH_CHANNEL(M)
+APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M)
+APPLY_FOR_PAUSEABLE_FAILPOINTS(M)
+APPLY_FOR_RANDOM_FAILPOINTS(M)
#undef M
} // namespace FailPoints
@@ -167,15 +193,15 @@ void FailPointHelper::enableFailPoint(const String & fail_point_name)
 }
#define M(NAME) SUB_M(NAME, FIU_ONETIME)
- APPLY_FOR_FAILPOINTS_ONCE_WITH_CHANNEL(M)
+ APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M)
#undef M
#define M(NAME) SUB_M(NAME, 0)
- APPLY_FOR_FAILPOINTS_WITH_CHANNEL(M)
+ APPLY_FOR_PAUSEABLE_FAILPOINTS(M)
#undef M
#undef SUB_M
- throw Exception("Cannot find fail point " + fail_point_name, ErrorCodes::FAIL_POINT_ERROR);
+ throw Exception(fmt::format("Cannot find fail point {}", fail_point_name), ErrorCodes::FAIL_POINT_ERROR);
 }
 void FailPointHelper::disableFailPoint(const String & fail_point_name)
@@ -200,6 +226,41 @@ void FailPointHelper::wait(const String & fail_point_name)
 ptr->wait();
 }
 }
+
+void FailPointHelper::initRandomFailPoints(Poco::Util::LayeredConfiguration & config, Poco::Logger * log)
+{
+ String random_fail_point_cfg = config.getString("flash.random_fail_points", "");
+ if (random_fail_point_cfg.empty())
+ return;
+
+ Poco::StringTokenizer string_tokens(random_fail_point_cfg, ",");
+ for (const auto & string_token : string_tokens)
+ {
+ Poco::StringTokenizer pair_tokens(string_token, "-");
+ RUNTIME_ASSERT((pair_tokens.count() == 2), log, "RandomFailPoints config should be FailPointA-RatioA,FailPointB-RatioB,... format");
+ double rate = atof(pair_tokens[1].c_str()); //NOLINT(cert-err34-c): check conversion error manually
+ RUNTIME_ASSERT((0 <= rate && rate <= 1.0), log, "RandomFailPoint trigger rate should be in [0,1], while {}", rate);
+ enableRandomFailPoint(pair_tokens[0], rate);
+ }
+ LOG_FMT_INFO(log, "Enable RandomFailPoints: {}", random_fail_point_cfg);
+}
+
+void FailPointHelper::enableRandomFailPoint(const String & fail_point_name, double rate)
+{
+#define SUB_M(NAME) \
+ if (fail_point_name == FailPoints::NAME) \
+ { \
+ fiu_enable_random(FailPoints::NAME, 1, nullptr, 0, rate); \
+ return; \
+ }
+
+#define M(NAME) SUB_M(NAME)
+ APPLY_FOR_RANDOM_FAILPOINTS(M)
+#undef M
+#undef SUB_M
+
+ throw Exception(fmt::format("Cannot find fail point {}", fail_point_name), ErrorCodes::FAIL_POINT_ERROR);
+}
#else
class FailPointChannel
{
@@ -210,6 +271,10 @@ void FailPointHelper::enableFailPoint(const String &) {}
void FailPointHelper::disableFailPoint(const String &) {}
void FailPointHelper::wait(const String &) {}
+
+void FailPointHelper::initRandomFailPoints(Poco::Util::LayeredConfiguration &, Poco::Logger *) {}
+
+void FailPointHelper::enableRandomFailPoint(const String &, double) {}
#endif
} // namespace DB
diff --git a/dbms/src/Common/FailPoint.h b/dbms/src/Common/FailPoint.h
index 2cf40ad55e4..31df2dbdcd2 100644
--- a/dbms/src/Common/FailPoint.h
+++ b/dbms/src/Common/FailPoint.h
@@ -21,6 +21,15 @@
#include
+
+namespace Poco
+{
+class Logger;
+namespace Util
+{
+class LayeredConfiguration;
+}
+} // namespace Poco
+
namespace DB
{
namespace ErrorCodes
@@ -35,7 +44,6 @@ extern const int FAIL_POINT_ERROR;
// When `fail_point` is enabled, wait till it is disabled
#define FAIL_POINT_PAUSE(fail_point) fiu_do_on(fail_point, FailPointHelper::wait(fail_point);)
-
 class FailPointChannel;
 class FailPointHelper
 {
@@ -46,6 +54,16 @@ class FailPointHelper
 static void wait(const String & fail_point_name);
+ /*
+ * For server RandomFailPoint testing. When FIU_ENABLE is defined, this function does the following work:
+ * 1. Return if the TiFlash config has an empty flash.random_fail_points entry
+ * 2. Parse flash.random_fail_points, which is expected to be in "FailPointA-RatioA,FailPointB-RatioB,..." format,
+ *    e.g. flash.random_fail_points = "random_aggregate_create_state_failpoint-0.2,random_interpreter_failpoint-0.05"
+ * 3. Call enableRandomFailPoint with each parsed FailPointName and Rate
+ */
+ static void initRandomFailPoints(Poco::Util::LayeredConfiguration & config, Poco::Logger * log);
+
+ static void enableRandomFailPoint(const String & fail_point_name, double rate);
+
 private:
 static std::unordered_map<String, std::shared_ptr<FailPointChannel>> fail_point_wait_channels;
 };
diff --git a/dbms/src/Common/MPMCQueue.h b/dbms/src/Common/MPMCQueue.h
index f550ecc7ca2..31dfc65a174 100644
--- a/dbms/src/Common/MPMCQueue.h
+++ b/dbms/src/Common/MPMCQueue.h
@@ -74,56 +74,80 @@ class MPMCQueue
 destruct(getObj(read_pos));
 }
- /// Block util:
+ /// Block until:
 /// 1. Pop succeeds with a valid T: return true.
 /// 2. The queue is cancelled or finished: return false.
- bool pop(T & obj)
+ ALWAYS_INLINE bool pop(T & obj)
 {
- return popObj(obj);
+ return popObj<true>(obj);
 }
- /// Besides all conditions mentioned at `pop`, `tryPop` will return false if `timeout` is exceeded.
+ /// Besides all conditions mentioned at `pop`, `popTimeout` will return false if `timeout` is exceeded.
 template <typename Duration>
- bool tryPop(T & obj, const Duration & timeout)
+ ALWAYS_INLINE bool popTimeout(T & obj, const Duration & timeout)
 {
 /// std::condition_variable::wait_until will always use system_clock.
 auto deadline = std::chrono::system_clock::now() + timeout;
- return popObj(obj, &deadline);
+ return popObj<true>(obj, &deadline);
 }
- /// Block util:
+ /// Non-blocking function.
+ /// Return true if pop succeeds,
+ /// else return false.
+ ALWAYS_INLINE bool tryPop(T & obj)
+ {
+ return popObj<false>(obj);
+ }
+
+ /// Block until:
 /// 1. Push succeeds and return true.
 /// 2. The queue is cancelled and return false.
 /// 3. The queue has finished and return false.
 template <typename U>
 ALWAYS_INLINE bool push(U && u)
 {
- return pushObj(std::forward<U>(u));
+ return pushObj<true>(std::forward<U>(u));
 }
- /// Besides all conditions mentioned at `push`, `tryPush` will return false if `timeout` is exceeded.
+ /// Besides all conditions mentioned at `push`, `pushTimeout` will return false if `timeout` is exceeded.
 template <typename U, typename Duration>
- ALWAYS_INLINE bool tryPush(U && u, const Duration & timeout)
+ ALWAYS_INLINE bool pushTimeout(U && u, const Duration & timeout)
 {
 /// std::condition_variable::wait_until will always use system_clock.
 auto deadline = std::chrono::system_clock::now() + timeout;
- return pushObj(std::forward<U>(u), &deadline);
+ return pushObj<true>(std::forward<U>(u), &deadline);
+ }
+
+ /// Non-blocking function.
+ /// Return true if push succeeds,
+ /// else return false.
+ template <typename U>
+ ALWAYS_INLINE bool tryPush(U && u)
+ {
+ return pushObj<false>(std::forward<U>(u));
 }
 /// The same as `push` except it will construct the object in place.
 template <typename... Args>
 ALWAYS_INLINE bool emplace(Args &&... args)
 {
- return emplaceObj(nullptr, std::forward<Args>(args)...);
+ return emplaceObj<true>(nullptr, std::forward<Args>(args)...);
 }
- /// The same as `tryPush` except it will construct the object in place.
+ /// The same as `pushTimeout` except it will construct the object in place.
 template <typename... Args, typename Duration>
- ALWAYS_INLINE bool tryEmplace(Args &&... args, const Duration & timeout)
+ ALWAYS_INLINE bool emplaceTimeout(Args &&... args, const Duration & timeout)
 {
 /// std::condition_variable::wait_until will always use system_clock.
 auto deadline = std::chrono::system_clock::now() + timeout;
- return emplaceObj(&deadline, std::forward<Args>(args)...);
+ return emplaceObj<true>(&deadline, std::forward<Args>(args)...);
+ }
+
+ /// The same as `tryPush` except it will construct the object in place.
+ template <typename... Args>
+ ALWAYS_INLINE bool tryEmplace(Args &&... args)
+ {
+ return emplaceObj<false>(nullptr, std::forward<Args>(args)...);
+ }
 /// Cancel a NORMAL queue will wake up all blocking readers and writers.
@@ -233,7 +257,8 @@ class MPMCQueue
 }
 }
- bool popObj(T & res, const TimePoint * deadline = nullptr)
+ template <bool need_wait>
+ bool popObj(T & res, [[maybe_unused]] const TimePoint * deadline = nullptr)
 {
#ifdef __APPLE__
 WaitingNode node;
@@ -241,14 +266,16 @@ class MPMCQueue
 thread_local WaitingNode node;
#endif
 {
- /// read_pos < write_pos means the queue isn't empty
- auto pred = [&] {
- return read_pos < write_pos || !isNormal();
- };
-
 std::unique_lock lock(mu);
- wait(lock, reader_head, node, pred, deadline);
+ if constexpr (need_wait)
+ {
+ /// read_pos < write_pos means the queue isn't empty
+ auto pred = [&] {
+ return read_pos < write_pos || !isNormal();
+ };
+ wait(lock, reader_head, node, pred, deadline);
+ }
 if (!isCancelled() && read_pos < write_pos)
 {
@@ -272,21 +299,23 @@ class MPMCQueue
 return false;
 }
- template <typename F>
- bool assignObj(const TimePoint * deadline, F && assigner)
+ template <bool need_wait, typename F>
+ bool assignObj([[maybe_unused]] const TimePoint * deadline, F && assigner)
 {
#ifdef __APPLE__
 WaitingNode node;
#else
 thread_local WaitingNode node;
#endif
- auto pred = [&] {
- return write_pos - read_pos < capacity || !isNormal();
- };
-
 std::unique_lock lock(mu);
- wait(lock, writer_head, node, pred, deadline);
+ if constexpr (need_wait)
+ {
+ auto pred = [&] {
+ return write_pos - read_pos < capacity || !isNormal();
+ };
+ wait(lock, writer_head, node, pred, deadline);
+ }
 /// double check status after potential wait
 /// check write_pos because timeouted will also reach here.
@@ -305,16 +334,16 @@ class MPMCQueue
 return false;
 }
- template <typename U>
+ template <bool need_wait, typename U>
 ALWAYS_INLINE bool pushObj(U && u, const TimePoint * deadline = nullptr)
 {
- return assignObj(deadline, [&](void * addr) { new (addr) T(std::forward<U>(u)); });
+ return assignObj<need_wait>(deadline, [&](void * addr) { new (addr) T(std::forward<U>(u)); });
 }
- template <typename... Args>
+ template <bool need_wait, typename... Args>
 ALWAYS_INLINE bool emplaceObj(const TimePoint * deadline, Args &&... args)
 {
- return assignObj(deadline, [&](void * addr) { new (addr) T(std::forward<Args>(args)...); });
+ return assignObj<need_wait>(deadline, [&](void * addr) { new (addr) T(std::forward<Args>(args)...); });
 }
 ALWAYS_INLINE bool isNormal() const
diff --git a/dbms/src/Common/MyDuration.cpp b/dbms/src/Common/MyDuration.cpp
index 8801ae0de44..513c40b6dbc 100644
--- a/dbms/src/Common/MyDuration.cpp
+++ b/dbms/src/Common/MyDuration.cpp
@@ -67,4 +67,4 @@ String MyDuration::toString() const
 auto frac_str = fmt::format("{:06}", microsecond);
 return fmt::format(fmt_str, sign > 0 ?
"" : "-", hour, minute, second, frac_str); } -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Common/PoolWithFailoverBase.h b/dbms/src/Common/PoolWithFailoverBase.h index a5483587e3c..04e6474c0fe 100644 --- a/dbms/src/Common/PoolWithFailoverBase.h +++ b/dbms/src/Common/PoolWithFailoverBase.h @@ -40,12 +40,6 @@ extern const int LOGICAL_ERROR; } // namespace ErrorCodes } // namespace DB -namespace ProfileEvents -{ -extern const Event DistributedConnectionFailTry; -extern const Event DistributedConnectionFailAtAll; -} // namespace ProfileEvents - /// This class provides a pool with fault tolerance. It is used for pooling of connections to replicated DB. /// Initialized by several PoolBase objects. /// When a connection is requested, tries to create or choose an alive connection from one of the nested pools. @@ -254,14 +248,12 @@ PoolWithFailoverBase::getMany( else { LOG_FMT_WARNING(log, "Connection failed at try No.{}, reason: {}", shuffled_pool.error_count + 1, fail_message); - ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); ++shuffled_pool.error_count; if (shuffled_pool.error_count >= max_tries) { ++failed_pools_count; - ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); } } } diff --git a/dbms/src/Common/ProfileEvents.cpp b/dbms/src/Common/ProfileEvents.cpp index 0ec1ce438a6..7507ff0b1f8 100644 --- a/dbms/src/Common/ProfileEvents.cpp +++ b/dbms/src/Common/ProfileEvents.cpp @@ -16,160 +16,98 @@ /// Available events. Add something here as you wish. -#define APPLY_FOR_EVENTS(M) \ - M(Query) \ - M(SelectQuery) \ - M(InsertQuery) \ - M(DeleteQuery) \ - M(FileOpen) \ - M(FileOpenFailed) \ - M(Seek) \ - M(ReadBufferFromFileDescriptorRead) \ - M(ReadBufferFromFileDescriptorReadFailed) \ - M(ReadBufferFromFileDescriptorReadBytes) \ - M(WriteBufferFromFileDescriptorWrite) \ - M(WriteBufferFromFileDescriptorWriteFailed) \ - M(WriteBufferFromFileDescriptorWriteBytes) \ - M(ReadBufferAIORead) \ - M(ReadBufferAIOReadBytes) \ - M(WriteBufferAIOWrite) \ - M(WriteBufferAIOWriteBytes) \ - M(ReadCompressedBytes) \ - M(CompressedReadBufferBlocks) \ - M(CompressedReadBufferBytes) \ - M(UncompressedCacheHits) \ - M(UncompressedCacheMisses) \ - M(UncompressedCacheWeightLost) \ - M(IOBufferAllocs) \ - M(IOBufferAllocBytes) \ - M(ArenaAllocChunks) \ - M(ArenaAllocBytes) \ - M(FunctionExecute) \ - M(TableFunctionExecute) \ - M(MarkCacheHits) \ - M(MarkCacheMisses) \ - M(CreatedReadBufferOrdinary) \ - M(CreatedReadBufferAIO) \ - M(CreatedWriteBufferOrdinary) \ - M(CreatedWriteBufferAIO) \ - \ - M(InsertedRows) \ - M(InsertedBytes) \ - M(DelayedInserts) \ - M(RejectedInserts) \ - M(DelayedInsertsMilliseconds) \ - M(DuplicatedInsertedBlocks) \ - \ - M(DistributedConnectionFailTry) \ - M(DistributedConnectionMissingTable) \ - M(DistributedConnectionStaleReplica) \ - M(DistributedConnectionFailAtAll) \ - \ - M(CompileAttempt) \ - M(CompileSuccess) \ - \ - M(ExternalSortWritePart) \ - M(ExternalSortMerge) \ - M(ExternalAggregationWritePart) \ - M(ExternalAggregationMerge) \ - M(ExternalAggregationCompressedBytes) \ - M(ExternalAggregationUncompressedBytes) \ - \ - M(SlowRead) \ - M(ReadBackoff) \ - \ - M(RegexpCreated) \ - M(ContextLock) \ - \ - M(StorageBufferFlush) \ - M(StorageBufferErrorOnFlush) \ - M(StorageBufferPassedAllMinThresholds) \ - M(StorageBufferPassedTimeMaxThreshold) \ - M(StorageBufferPassedRowsMaxThreshold) \ - M(StorageBufferPassedBytesMaxThreshold) \ - \ - M(DictCacheKeysRequested) \ - M(DictCacheKeysRequestedMiss) \ 
- M(DictCacheKeysRequestedFound) \ - M(DictCacheKeysExpired) \ - M(DictCacheKeysNotFound) \ - M(DictCacheKeysHit) \ - M(DictCacheRequestTimeNs) \ - M(DictCacheRequests) \ - M(DictCacheLockWriteNs) \ - M(DictCacheLockReadNs) \ - \ - M(DistributedSyncInsertionTimeoutExceeded) \ - M(DataAfterMergeDiffersFromReplica) \ - M(PolygonsAddedToPool) \ - M(PolygonsInPoolAllocatedBytes) \ - M(RWLockAcquiredReadLocks) \ - M(RWLockAcquiredWriteLocks) \ - M(RWLockReadersWaitMilliseconds) \ - M(RWLockWritersWaitMilliseconds) \ - \ - M(PSMWritePages) \ - M(PSMWriteIOCalls) \ - M(PSV3MBlobExpansion) \ - M(PSV3MBlobReused) \ - M(PSMWriteBytes) \ - M(PSMBackgroundWriteBytes) \ - M(PSMReadPages) \ - M(PSMBackgroundReadBytes) \ - \ - M(PSMReadIOCalls) \ - M(PSMReadBytes) \ - M(PSMWriteFailed) \ - M(PSMReadFailed) \ - \ - M(PSMVCCApplyOnCurrentBase) \ - M(PSMVCCApplyOnCurrentDelta) \ - M(PSMVCCApplyOnNewDelta) \ - M(PSMVCCCompactOnDelta) \ - M(PSMVCCCompactOnDeltaRebaseRejected) \ - M(PSMVCCCompactOnBase) \ - \ - M(DMWriteBytes) \ - M(DMWriteBlock) \ - M(DMWriteBlockNS) \ - M(DMWriteFile) \ - M(DMWriteFileNS) \ - M(DMDeleteRange) \ - M(DMDeleteRangeNS) \ - M(DMAppendDeltaPrepare) \ - M(DMAppendDeltaPrepareNS) \ - M(DMAppendDeltaCommitMemory) \ - M(DMAppendDeltaCommitMemoryNS) \ - M(DMAppendDeltaCommitDisk) \ - M(DMAppendDeltaCommitDiskNS) \ - M(DMAppendDeltaCleanUp) \ - M(DMAppendDeltaCleanUpNS) \ - M(DMPlace) \ - M(DMPlaceNS) \ - M(DMPlaceUpsert) \ - M(DMPlaceUpsertNS) \ - M(DMPlaceDeleteRange) \ - M(DMPlaceDeleteRangeNS) \ - M(DMDeltaMerge) \ - M(DMDeltaMergeNS) \ - M(DMSegmentSplit) \ - M(DMSegmentSplitNS) \ - M(DMSegmentGetSplitPoint) \ - M(DMSegmentGetSplitPointNS) \ - M(DMSegmentMerge) \ - M(DMSegmentMergeNS) \ - M(DMFlushDeltaCache) \ - M(DMFlushDeltaCacheNS) \ - M(DMCleanReadRows) \ - \ - M(FileFSync) \ - \ - M(DMFileFilterNoFilter) \ - M(DMFileFilterAftPKAndPackSet) \ - M(DMFileFilterAftRoughSet) \ - \ - M(ChecksumDigestBytes) \ - \ +#define APPLY_FOR_EVENTS(M) \ + M(Query) \ + M(FileOpen) \ + M(FileOpenFailed) \ + M(ReadBufferFromFileDescriptorRead) \ + M(ReadBufferFromFileDescriptorReadFailed) \ + M(ReadBufferFromFileDescriptorReadBytes) \ + M(WriteBufferFromFileDescriptorWrite) \ + M(WriteBufferFromFileDescriptorWriteBytes) \ + M(ReadBufferAIORead) \ + M(ReadBufferAIOReadBytes) \ + M(WriteBufferAIOWrite) \ + M(WriteBufferAIOWriteBytes) \ + \ + M(UncompressedCacheHits) \ + M(UncompressedCacheMisses) \ + M(UncompressedCacheWeightLost) \ + M(MarkCacheHits) \ + M(MarkCacheMisses) \ + \ + M(ExternalAggregationCompressedBytes) \ + M(ExternalAggregationUncompressedBytes) \ + \ + M(ContextLock) \ + \ + M(RWLockAcquiredReadLocks) \ + M(RWLockAcquiredWriteLocks) \ + M(RWLockReadersWaitMilliseconds) \ + M(RWLockWritersWaitMilliseconds) \ + \ + M(PSMWritePages) \ + M(PSMWriteIOCalls) \ + M(PSV3MBlobExpansion) \ + M(PSV3MBlobReused) \ + M(PSMWriteBytes) \ + M(PSMBackgroundWriteBytes) \ + M(PSMReadPages) \ + M(PSMBackgroundReadBytes) \ + \ + M(PSMReadIOCalls) \ + M(PSMReadBytes) \ + M(PSMWriteFailed) \ + M(PSMReadFailed) \ + \ + M(PSMVCCApplyOnCurrentBase) \ + M(PSMVCCApplyOnCurrentDelta) \ + M(PSMVCCApplyOnNewDelta) \ + M(PSMVCCCompactOnDelta) \ + M(PSMVCCCompactOnDeltaRebaseRejected) \ + M(PSMVCCCompactOnBase) \ + \ + M(DMWriteBytes) \ + M(DMWriteBlock) \ + M(DMWriteBlockNS) \ + M(DMWriteFile) \ + M(DMWriteFileNS) \ + M(DMDeleteRange) \ + M(DMDeleteRangeNS) \ + M(DMAppendDeltaPrepare) \ + M(DMAppendDeltaPrepareNS) \ + M(DMAppendDeltaCommitMemory) \ + M(DMAppendDeltaCommitMemoryNS) \ + M(DMAppendDeltaCommitDisk) 
\ + M(DMAppendDeltaCommitDiskNS) \ + M(DMAppendDeltaCleanUp) \ + M(DMAppendDeltaCleanUpNS) \ + M(DMPlace) \ + M(DMPlaceNS) \ + M(DMPlaceUpsert) \ + M(DMPlaceUpsertNS) \ + M(DMPlaceDeleteRange) \ + M(DMPlaceDeleteRangeNS) \ + M(DMDeltaMerge) \ + M(DMDeltaMergeNS) \ + M(DMSegmentSplit) \ + M(DMSegmentSplitNS) \ + M(DMSegmentGetSplitPoint) \ + M(DMSegmentGetSplitPointNS) \ + M(DMSegmentMerge) \ + M(DMSegmentMergeNS) \ + M(DMFlushDeltaCache) \ + M(DMFlushDeltaCacheNS) \ + M(DMCleanReadRows) \ + \ + M(FileFSync) \ + \ + M(DMFileFilterNoFilter) \ + M(DMFileFilterAftPKAndPackSet) \ + M(DMFileFilterAftRoughSet) \ + \ + M(ChecksumDigestBytes) \ + \ M(RaftWaitIndexTimeout) namespace ProfileEvents diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index 9aa826e0e30..c0ce60af01e 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -60,27 +60,27 @@ namespace DB F(type_partition_ts, {"type", "partition_table_scan"}), \ F(type_window, {"type", "window"}), F(type_window_sort, {"type", "window_sort"})) \ M(tiflash_coprocessor_request_duration_seconds, "Bucketed histogram of request duration", Histogram, \ - F(type_batch, {{"type", "batch"}}, ExpBuckets{0.0005, 2, 30}), F(type_cop, {{"type", "cop"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_super_batch, {{"type", "super_batch"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_dispatch_mpp_task, {{"type", "dispatch_mpp_task"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_mpp_establish_conn, {{"type", "mpp_establish_conn"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_cancel_mpp_task, {{"type", "cancel_mpp_task"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_run_mpp_task, {{"type", "run_mpp_task"}}, ExpBuckets{0.0005, 2, 30})) \ + F(type_batch, {{"type", "batch"}}, ExpBuckets{0.001, 2, 20}), F(type_cop, {{"type", "cop"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_super_batch, {{"type", "super_batch"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_dispatch_mpp_task, {{"type", "dispatch_mpp_task"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_mpp_establish_conn, {{"type", "mpp_establish_conn"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_cancel_mpp_task, {{"type", "cancel_mpp_task"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_run_mpp_task, {{"type", "run_mpp_task"}}, ExpBuckets{0.001, 2, 20})) \ M(tiflash_coprocessor_request_memory_usage, "Bucketed histogram of request memory usage", Histogram, \ F(type_cop, {{"type", "cop"}}, ExpBuckets{1024 * 1024, 2, 16}), \ - F(type_super_batch, {{"type", "super_batch"}}, ExpBuckets{1024 * 1024, 2, 16}), \ - F(type_run_mpp_task, {{"type", "run_mpp_task"}}, ExpBuckets{1024 * 1024, 2, 16})) \ + F(type_super_batch, {{"type", "super_batch"}}, ExpBuckets{1024 * 1024, 2, 20}), \ + F(type_run_mpp_task, {{"type", "run_mpp_task"}}, ExpBuckets{1024 * 1024, 2, 20})) \ M(tiflash_coprocessor_request_error, "Total number of request error", Counter, F(reason_meet_lock, {"reason", "meet_lock"}), \ F(reason_region_not_found, {"reason", "region_not_found"}), F(reason_epoch_not_match, {"reason", "epoch_not_match"}), \ F(reason_kv_client_error, {"reason", "kv_client_error"}), F(reason_internal_error, {"reason", "internal_error"}), \ F(reason_other_error, {"reason", "other_error"})) \ M(tiflash_coprocessor_request_handle_seconds, "Bucketed histogram of request handle duration", Histogram, \ - F(type_batch, {{"type", "batch"}}, ExpBuckets{0.0005, 2, 30}), F(type_cop, {{"type", "cop"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_super_batch, {{"type", "super_batch"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_dispatch_mpp_task, {{"type", 
"dispatch_mpp_task"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_mpp_establish_conn, {{"type", "mpp_establish_conn"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_cancel_mpp_task, {{"type", "cancel_mpp_task"}}, ExpBuckets{0.0005, 2, 30}), \ - F(type_run_mpp_task, {{"type", "run_mpp_task"}}, ExpBuckets{0.0005, 2, 30})) \ + F(type_batch, {{"type", "batch"}}, ExpBuckets{0.001, 2, 20}), F(type_cop, {{"type", "cop"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_super_batch, {{"type", "super_batch"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_dispatch_mpp_task, {{"type", "dispatch_mpp_task"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_mpp_establish_conn, {{"type", "mpp_establish_conn"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_cancel_mpp_task, {{"type", "cancel_mpp_task"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_run_mpp_task, {{"type", "run_mpp_task"}}, ExpBuckets{0.001, 2, 20})) \ M(tiflash_coprocessor_response_bytes, "Total bytes of response body", Counter) \ M(tiflash_schema_version, "Current version of tiflash cached schema", Gauge) \ M(tiflash_schema_applying, "Whether the schema is applying or not (holding lock)", Gauge) \ @@ -95,21 +95,14 @@ namespace DB F(type_alter_column_tp, {"type", "alter_column_type"}), F(type_rename_column, {"type", "rename_column"}), \ F(type_exchange_partition, {"type", "exchange_partition"})) \ M(tiflash_schema_apply_duration_seconds, "Bucketed histogram of ddl apply duration", Histogram, \ - F(type_ddl_apply_duration, {{"req", "ddl_apply_duration"}}, ExpBuckets{0.0005, 2, 20})) \ - M(tiflash_tmt_merge_count, "Total number of TMT engine merge", Counter) \ - M(tiflash_tmt_merge_duration_seconds, "Bucketed histogram of TMT engine merge duration", Histogram, \ - F(type_tmt_merge_duration, {{"type", "tmt_merge_duration"}}, ExpBuckets{0.0005, 2, 20})) \ - M(tiflash_tmt_write_parts_count, "Total number of TMT engine write parts", Counter) \ - M(tiflash_tmt_write_parts_duration_seconds, "Bucketed histogram of TMT engine write parts duration", Histogram, \ - F(type_tmt_write_duration, {{"type", "tmt_write_parts_duration"}}, ExpBuckets{0.0005, 2, 20})) \ - M(tiflash_tmt_read_parts_count, "Total number of TMT engine read parts", Gauge) \ + F(type_ddl_apply_duration, {{"req", "ddl_apply_duration"}}, ExpBuckets{0.001, 2, 20})) \ M(tiflash_raft_read_index_count, "Total number of raft read index", Counter) \ M(tiflash_raft_read_index_duration_seconds, "Bucketed histogram of raft read index duration", Histogram, \ - F(type_raft_read_index_duration, {{"type", "tmt_raft_read_index_duration"}}, ExpBuckets{0.0005, 2, 20})) \ + F(type_raft_read_index_duration, {{"type", "tmt_raft_read_index_duration"}}, ExpBuckets{0.001, 2, 20})) \ M(tiflash_raft_wait_index_duration_seconds, "Bucketed histogram of raft wait index duration", Histogram, \ - F(type_raft_wait_index_duration, {{"type", "tmt_raft_wait_index_duration"}}, ExpBuckets{0.0005, 2, 20})) \ + F(type_raft_wait_index_duration, {{"type", "tmt_raft_wait_index_duration"}}, ExpBuckets{0.001, 2, 20})) \ M(tiflash_syncing_data_freshness, "The freshness of tiflash data with tikv data", Histogram, \ - F(type_syncing_data_freshness, {{"type", "data_freshness"}}, ExpBuckets{0.0005, 2, 20})) \ + F(type_syncing_data_freshness, {{"type", "data_freshness"}}, ExpBuckets{0.001, 2, 20})) \ M(tiflash_storage_write_amplification, "The data write amplification in storage engine", Gauge) \ M(tiflash_storage_read_tasks_count, "Total number of storage engine read tasks", Counter) \ M(tiflash_storage_command_count, "Total number of storage's command, such as delete range / 
shutdown /startup", Counter, \
@@ -122,16 +115,16 @@ namespace DB
 F(type_seg_split, {"type", "seg_split"}), F(type_seg_split_fg, {"type", "seg_split_fg"}), \
 F(type_seg_merge, {"type", "seg_merge"}), F(type_place_index_update, {"type", "place_index_update"})) \
 M(tiflash_storage_subtask_duration_seconds, "Bucketed histogram of storage's sub task duration", Histogram, \
- F(type_delta_merge, {{"type", "delta_merge"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_delta_merge_fg, {{"type", "delta_merge_fg"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_delta_merge_fg_rpc, {{"type", "delta_merge_fg_rpc"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_delta_merge_bg_gc, {{"type", "delta_merge_bg_gc"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_delta_compact, {{"type", "delta_compact"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_delta_flush, {{"type", "delta_flush"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_seg_split, {{"type", "seg_split"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_seg_split_fg, {{"type", "seg_split_fg"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_seg_merge, {{"type", "seg_merge"}}, ExpBuckets{0.0005, 2, 20}), \
- F(type_place_index_update, {{"type", "place_index_update"}}, ExpBuckets{0.0005, 2, 20})) \
+ F(type_delta_merge, {{"type", "delta_merge"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_delta_merge_fg, {{"type", "delta_merge_fg"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_delta_merge_fg_rpc, {{"type", "delta_merge_fg_rpc"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_delta_merge_bg_gc, {{"type", "delta_merge_bg_gc"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_delta_compact, {{"type", "delta_compact"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_delta_flush, {{"type", "delta_flush"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_seg_split, {{"type", "seg_split"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_seg_split_fg, {{"type", "seg_split_fg"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_seg_merge, {{"type", "seg_merge"}}, ExpBuckets{0.001, 2, 20}), \
+ F(type_place_index_update, {{"type", "place_index_update"}}, ExpBuckets{0.001, 2, 20})) \
 M(tiflash_storage_throughput_bytes, "Calculate the throughput of tasks of storage in bytes", Gauge, /**/ \
 F(type_write, {"type", "write"}), /**/ \
 F(type_ingest, {"type", "ingest"}), /**/ \
@@ -145,8 +138,8 @@ namespace DB
 F(type_split, {"type", "split"}), /**/ \
 F(type_merge, {"type", "merge"})) /**/ \
 M(tiflash_storage_write_stall_duration_seconds, "The write stall duration of storage, in seconds", Histogram, /**/ \
- F(type_write, {{"type", "write"}}, ExpBuckets{0.0005, 2, 20}), /**/ \
- F(type_delete_range, {{"type", "delete_range"}}, ExpBuckets{0.0005, 2, 20})) /**/ \
+ F(type_write, {{"type", "write"}}, ExpBuckets{0.001, 2, 20}), /**/ \
+ F(type_delete_range, {{"type", "delete_range"}}, ExpBuckets{0.001, 2, 20})) /**/ \
 M(tiflash_storage_page_gc_count, "Total number of page's gc execution.", Counter, \
 F(type_exec, {"type", "exec"}), \
 F(type_low_write, {"type", "low_write"}), \
@@ -170,7 +163,7 @@ namespace DB
 Histogram, /* these commands usually cost several seconds, increase the start bucket to 50ms */ \
 F(type_ingest_sst, {{"type", "ingest_sst"}}, ExpBuckets{0.05, 2, 10}), \
 F(type_apply_snapshot_predecode, {{"type", "snapshot_predecode"}}, ExpBuckets{0.05, 2, 10}), \
- F(type_apply_snapshot_predecode_sst2dt, {{"type", "snapshot_predecode_sst2dt"}}, ExpBuckets{0.05, 2, 10}), \
+ F(type_apply_snapshot_predecode_sst2dt, {{"type", "snapshot_predecode_sst2dt"}}, ExpBuckets{0.05, 2, 10}), \
 F(type_apply_snapshot_flush, {{"type", "snapshot_flush"}}, ExpBuckets{0.05, 2, 10})) \
M(tiflash_raft_process_keys, "Total number of keys processed in some types of Raft commands", Counter, \
 F(type_apply_snapshot, {"type", "apply_snapshot"}), F(type_ingest_sst, {"type", "ingest_sst"})) \
@@ -212,7 +205,7 @@ namespace DB
 F(type_thread_hard_limit, {"type", "thread_hard_limit"}), \
 F(type_hard_limit_exceeded_count, {"type", "hard_limit_exceeded_count"})) \
 M(tiflash_task_scheduler_waiting_duration_seconds, "Bucketed histogram of task waiting for scheduling duration", Histogram, \
- F(type_task_scheduler_waiting_duration, {{"type", "task_waiting_duration"}}, ExpBuckets{0.0005, 2, 20}))
+ F(type_task_scheduler_waiting_duration, {{"type", "task_waiting_duration"}}, ExpBuckets{0.001, 2, 20}))
// clang-format on
diff --git a/dbms/src/Common/getNumberOfPhysicalCPUCores.h b/dbms/src/Common/getNumberOfPhysicalCPUCores.h
index 6f7eaef4bb4..b3ab65a66e5 100644
--- a/dbms/src/Common/getNumberOfPhysicalCPUCores.h
+++ b/dbms/src/Common/getNumberOfPhysicalCPUCores.h
@@ -15,4 +15,5 @@
#pragma once
/// Get number of CPU cores without hyper-threading.
+/// Note: this does not account for resource isolation mechanisms such as Docker or cgroups.
unsigned getNumberOfPhysicalCPUCores();
diff --git a/dbms/src/Common/tests/gtest_mpmc_queue.cpp b/dbms/src/Common/tests/gtest_mpmc_queue.cpp
index 85ad1892067..3f2748b452b 100644
--- a/dbms/src/Common/tests/gtest_mpmc_queue.cpp
+++ b/dbms/src/Common/tests/gtest_mpmc_queue.cpp
@@ -98,12 +98,14 @@ class MPMCQueueTest : public ::testing::Test
 void testCannotTryPush(MPMCQueue<T> & queue)
 {
 auto old_size = queue.size();
- auto res = queue.tryPush(ValueHelper::make(-1), std::chrono::microseconds(1));
- auto new_size = queue.size();
- if (res)
+ bool ok1 = queue.tryPush(ValueHelper::make(-1));
+ auto new_size1 = queue.size();
+ bool ok2 = queue.pushTimeout(ValueHelper::make(-1), std::chrono::microseconds(1));
+ auto new_size2 = queue.size();
+ if (ok1 || ok2)
 throw TiFlashTestException("Should push fail");
- if (old_size != new_size)
- throw TiFlashTestException(fmt::format("Size changed from {} to {} without push", old_size, new_size));
+ if (old_size != new_size1 || old_size != new_size2)
+ throw TiFlashTestException(fmt::format("Size changed from {} to {} and {} without push", old_size, new_size1, new_size2));
 }
 template
@@ -124,12 +126,14 @@ class MPMCQueueTest : public ::testing::Test
 {
 auto old_size = queue.size();
 T res;
- bool ok = queue.tryPop(res, std::chrono::microseconds(1));
- auto new_size = queue.size();
- if (ok)
+ bool ok1 = queue.tryPop(res);
+ auto new_size1 = queue.size();
+ bool ok2 = queue.popTimeout(res, std::chrono::microseconds(1));
+ auto new_size2 = queue.size();
+ if (ok1 || ok2)
 throw TiFlashTestException("Should pop fail");
- if (old_size != new_size)
- throw TiFlashTestException(fmt::format("Size changed from {} to {} without pop", old_size, new_size));
+ if (old_size != new_size1 || old_size != new_size2)
+ throw TiFlashTestException(fmt::format("Size changed from {} to {} and {} without pop", old_size, new_size1, new_size2));
 }
 template
@@ -474,7 +478,6 @@ class MPMCQueueTest : public ::testing::Test
 throwOrMove(std::move(rhs));
 }
-
 ThrowInjectable & operator=(ThrowInjectable && rhs)
 {
 if (this != &rhs)
diff --git a/dbms/src/Common/tests/mpmc_queue_perftest.cpp b/dbms/src/Common/tests/mpmc_queue_perftest.cpp
index d047b5d498f..ba0d00001a3 100644
--- a/dbms/src/Common/tests/mpmc_queue_perftest.cpp
+++ b/dbms/src/Common/tests/mpmc_queue_perftest.cpp
@@ -87,7 +87,7 @@ struct Helper<MPMCQueue<T>>
 template <typename U>
 static void pushOneTo(MPMCQueue<T> & queue, U
&& data)
 {
- queue.tryPush(std::forward<U>(data), std::chrono::milliseconds(1));
+ queue.pushTimeout(std::forward<U>(data), std::chrono::milliseconds(1));
 }
};
diff --git a/dbms/src/Common/wrapInvocable.h b/dbms/src/Common/wrapInvocable.h
index d6cee519835..1c93bb3e782 100644
--- a/dbms/src/Common/wrapInvocable.h
+++ b/dbms/src/Common/wrapInvocable.h
@@ -35,7 +35,6 @@ inline auto wrapInvocable(bool propagate_memory_tracker, Func && func, Args &&..
 // run the task with the parameters provided
 return std::apply(std::move(func), std::move(args));
 };
-
 return capture;
}
} // namespace DB
diff --git a/dbms/src/Core/Block.cpp b/dbms/src/Core/Block.cpp
index 28db7af82e1..971e8f36e2a 100644
--- a/dbms/src/Core/Block.cpp
+++ b/dbms/src/Core/Block.cpp
@@ -238,10 +238,18 @@ void Block::checkNumberOfRows() const
 if (rows == -1)
 rows = size;
 else if (rows != size)
- throw Exception("Sizes of columns doesn't match: "
- + data.front().name + ": " + toString(rows)
- + ", " + elem.name + ": " + toString(size),
+ {
+ auto first_col = data.front();
+ throw Exception(fmt::format(
+ "Sizes of columns doesn't match: {}(id={}): {}, {}(id={}): {}",
+ first_col.name,
+ first_col.column_id,
+ rows,
+ elem.name,
+ elem.column_id,
+ size),
 ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
+ }
 }
}
diff --git a/dbms/src/DataStreams/AggregatingBlockInputStream.cpp b/dbms/src/DataStreams/AggregatingBlockInputStream.cpp
index 0d9e907c5f4..4cd09d1ea63 100644
--- a/dbms/src/DataStreams/AggregatingBlockInputStream.cpp
+++ b/dbms/src/DataStreams/AggregatingBlockInputStream.cpp
@@ -17,12 +17,6 @@
#include
#include
-
-namespace ProfileEvents
-{
-extern const Event ExternalAggregationMerge;
-}
-
namespace DB
{
Block AggregatingBlockInputStream::getHeader() const
@@ -56,8 +50,6 @@ Block AggregatingBlockInputStream::readImpl()
 * then read and merge them, spending the minimum amount of memory.
 */
- ProfileEvents::increment(ProfileEvents::ExternalAggregationMerge);
-
 if (!isCancelled())
 {
 /// Flush data in the RAM to disk also. It's easier than merging on-disk and RAM data.
diff --git a/dbms/src/DataStreams/AsynchronousBlockInputStream.h b/dbms/src/DataStreams/AsynchronousBlockInputStream.h
index e75d1603648..5b373c26e95 100644
--- a/dbms/src/DataStreams/AsynchronousBlockInputStream.h
+++ b/dbms/src/DataStreams/AsynchronousBlockInputStream.h
@@ -22,12 +22,6 @@
#include
#include
-
-namespace CurrentMetrics
-{
-extern const Metric QueryThread;
-}
-
namespace DB
{
/** Executes another BlockInputStream in a separate thread.
@@ -141,8 +135,6 @@ class AsynchronousBlockInputStream : public IProfilingBlockInputStream
 /// Calculations that can be performed in a separate thread
 void calculate()
 {
- CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread};
-
 try
 {
 if (first)
diff --git a/dbms/src/DataStreams/CountingBlockOutputStream.cpp b/dbms/src/DataStreams/CountingBlockOutputStream.cpp
index 26bc5a4566f..52dc6b598b9 100644
--- a/dbms/src/DataStreams/CountingBlockOutputStream.cpp
+++ b/dbms/src/DataStreams/CountingBlockOutputStream.cpp
@@ -12,20 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
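(The `Block::checkNumberOfRows` hunk above switches the mismatch message to `fmt::format` and adds each column's id. A hedged sketch of the resulting message shape — column names and sizes here are hypothetical:)

```cpp
// Sketch only: reproduces the message shape of the fmt::format call above.
#include <fmt/format.h>

#include <string>

std::string demoSizeMismatchMessage()
{
    // Yields: "Sizes of columns doesn't match: a(id=1): 3, b(id=2): 4"
    return fmt::format(
        "Sizes of columns doesn't match: {}(id={}): {}, {}(id={}): {}",
        "a", 1, 3,  // first column: name, column_id, row count
        "b", 2, 4); // offending column: name, column_id, row count
}
```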
-#include
#include
-
-
-namespace ProfileEvents
-{
- extern const Event InsertedRows;
- extern const Event InsertedBytes;
-}
-
+#include
 namespace DB
{
-
 void CountingBlockOutputStream::write(const Block & block)
{
 stream->write(block);
@@ -33,9 +24,6 @@ void CountingBlockOutputStream::write(const Block & block)
 Progress local_progress(block.rows(), block.bytes(), 0);
 progress.incrementPiecewiseAtomically(local_progress);
- ProfileEvents::increment(ProfileEvents::InsertedRows, local_progress.rows);
- ProfileEvents::increment(ProfileEvents::InsertedBytes, local_progress.bytes);
-
 if (process_elem)
 process_elem->updateProgressOut(local_progress);
@@ -43,4 +31,4 @@ void CountingBlockOutputStream::write(const Block & block)
 progress_callback(local_progress);
}
-}
+} // namespace DB
diff --git a/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp b/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp
index e79426f686e..cf8db3f8711 100644
--- a/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp
+++ b/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp
@@ -19,13 +19,6 @@
#include
#include
-
-namespace ProfileEvents
-{
-extern const Event ExternalSortWritePart;
-extern const Event ExternalSortMerge;
-} // namespace ProfileEvents
-
namespace DB
{
/** Remove constant columns from block.
@@ -136,7 +129,6 @@ Block MergeSortingBlockInputStream::readImpl()
 MergeSortingBlocksBlockInputStream block_in(blocks, description, log->identifier(), max_merged_block_size, limit);
 LOG_FMT_INFO(log, "Sorting and writing part of data into temporary file {}", path);
- ProfileEvents::increment(ProfileEvents::ExternalSortWritePart);
 copyData(block_in, block_out, &is_cancelled); /// NOTE. Possibly limit disk usage.
 LOG_FMT_INFO(log, "Done writing part of data into temporary file {}", path);
@@ -155,7 +147,6 @@ Block MergeSortingBlockInputStream::readImpl()
 else
 {
 /// If there were temporary files.
- ProfileEvents::increment(ProfileEvents::ExternalSortMerge); LOG_FMT_INFO(log, "There are {} temporary sorted parts to merge.", temporary_files.size()); diff --git a/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.cpp b/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.cpp index 5d0b677b792..3a1cc1eed31 100644 --- a/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.cpp +++ b/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.cpp @@ -19,13 +19,6 @@ #include - -namespace CurrentMetrics -{ -extern const Metric QueryThread; -} - - namespace DB { /** Scheme of operation: @@ -156,7 +149,7 @@ void MergingAggregatedMemoryEfficientBlockInputStream::cancel(bool kill) for (auto & input : inputs) { - if (IProfilingBlockInputStream * child = dynamic_cast(input.stream.get())) + if (auto * child = dynamic_cast(input.stream.get())) { try { @@ -198,7 +191,6 @@ void MergingAggregatedMemoryEfficientBlockInputStream::start() reading_pool->schedule( wrapInvocable(true, [&child] { - CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread}; child->readPrefix(); })); } @@ -309,8 +301,6 @@ void MergingAggregatedMemoryEfficientBlockInputStream::finalize() void MergingAggregatedMemoryEfficientBlockInputStream::mergeThread() { - CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread}; - try { while (!parallel_merge_data->finish) @@ -490,7 +480,6 @@ MergingAggregatedMemoryEfficientBlockInputStream::BlocksToMerge MergingAggregate if (need_that_input(input)) { reading_pool->schedule(wrapInvocable(true, [&input, &read_from_input] { - CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread}; read_from_input(input); })); } diff --git a/dbms/src/DataStreams/MultiplexInputStream.h b/dbms/src/DataStreams/MultiplexInputStream.h new file mode 100644 index 00000000000..4fa33262e66 --- /dev/null +++ b/dbms/src/DataStreams/MultiplexInputStream.h @@ -0,0 +1,246 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
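(The new `DataStreams/MultiplexInputStream.h` added here introduces a mutex-protected pool of per-partition streams plus an input stream that drains it. A hedged sketch of the intended wiring — everything outside the new header, such as the function name, `partition_streams`, and `max_threads`, is hypothetical:)

```cpp
// Hypothetical caller-side wiring for the classes declared in the new header.
#include <DataStreams/MultiplexInputStream.h> // the new header below; path assumed

#include <memory>
#include <vector>

DB::BlockInputStreams buildMultiplexReaders(
    const std::vector<DB::BlockInputStreams> & partition_streams,
    size_t max_threads)
{
    auto pool = std::make_shared<DB::MultiPartitionStreamPool>();
    for (const auto & streams : partition_streams)
        pool->addPartitionStreams(streams); // register one partition's streams

    // Each reader repeatedly picks a whole stream from the shared pool, so a
    // partition stream is consumed by exactly one reader at a time.
    DB::BlockInputStreams readers;
    for (size_t i = 0; i < max_threads; ++i)
        readers.push_back(std::make_shared<DB::MultiplexInputStream>(pool, /*req_id=*/"demo"));
    return readers;
}
```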
+ +#pragma once + +#include + +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} // namespace ErrorCodes + +class MultiPartitionStreamPool +{ +public: + MultiPartitionStreamPool() = default; + + void addPartitionStreams(const BlockInputStreams & cur_streams) + { + if (cur_streams.empty()) + return; + std::unique_lock lk(mu); + streams_queue_by_partition.push_back( + std::make_shared>>()); + for (const auto & stream : cur_streams) + streams_queue_by_partition.back()->push(stream); + added_streams.insert(added_streams.end(), cur_streams.begin(), cur_streams.end()); + } + + std::shared_ptr pickOne() + { + std::unique_lock lk(mu); + if (streams_queue_by_partition.empty()) + return nullptr; + if (streams_queue_id >= static_cast(streams_queue_by_partition.size())) + streams_queue_id = 0; + + auto & q = *streams_queue_by_partition[streams_queue_id]; + std::shared_ptr ret = nullptr; + assert(!q.empty()); + ret = q.front(); + q.pop(); + if (q.empty()) + streams_queue_id = removeQueue(streams_queue_id); + else + streams_queue_id = nextQueueId(streams_queue_id); + return ret; + } + + int exportAddedStreams(BlockInputStreams & ret_streams) + { + std::unique_lock lk(mu); + for (auto & stream : added_streams) + ret_streams.push_back(stream); + return added_streams.size(); + } + + int addedStreamsCnt() + { + std::unique_lock lk(mu); + return added_streams.size(); + } + +private: + int removeQueue(int queue_id) + { + streams_queue_by_partition[queue_id] = nullptr; + if (queue_id != static_cast(streams_queue_by_partition.size()) - 1) + { + swap(streams_queue_by_partition[queue_id], streams_queue_by_partition.back()); + streams_queue_by_partition.pop_back(); + return queue_id; + } + else + { + streams_queue_by_partition.pop_back(); + return 0; + } + } + + int nextQueueId(int queue_id) const + { + if (queue_id + 1 < static_cast(streams_queue_by_partition.size())) + return queue_id + 1; + else + return 0; + } + + static void swap(std::shared_ptr>> & a, + std::shared_ptr>> & b) + { + a.swap(b); + } + + std::vector< + std::shared_ptr>>> + streams_queue_by_partition; + std::vector> added_streams; + int streams_queue_id = 0; + std::mutex mu; +}; + +class MultiplexInputStream final : public IProfilingBlockInputStream +{ +private: + static constexpr auto NAME = "Multiplex"; + +public: + MultiplexInputStream( + std::shared_ptr & shared_pool, + const String & req_id) + : log(Logger::get(NAME, req_id)) + , shared_pool(shared_pool) + { + shared_pool->exportAddedStreams(children); + size_t num_children = children.size(); + if (num_children > 1) + { + Block header = children.at(0)->getHeader(); + for (size_t i = 1; i < num_children; ++i) + assertBlocksHaveEqualStructure( + children[i]->getHeader(), + header, + "MULTIPLEX"); + } + } + + String getName() const override { return NAME; } + + ~MultiplexInputStream() override + { + try + { + if (!all_read) + cancel(false); + } + catch (...) + { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + } + } + + /** Different from the default implementation by trying to stop all sources, + * skipping failed by execution. 
+ */ + void cancel(bool kill) override + { + if (kill) + is_killed = true; + + bool old_val = false; + if (!is_cancelled.compare_exchange_strong( + old_val, + true, + std::memory_order_seq_cst, + std::memory_order_relaxed)) + return; + + if (cur_stream) + { + if (IProfilingBlockInputStream * child = dynamic_cast(&*cur_stream)) + { + child->cancel(kill); + } + } + } + + Block getHeader() const override { return children.at(0)->getHeader(); } + +protected: + /// Do nothing, to make the preparation when underlying InputStream is picked from the pool + void readPrefix() override + { + } + + /** The following options are possible: + * 1. `readImpl` function is called until it returns an empty block. + * Then `readSuffix` function is called and then destructor. + * 2. `readImpl` function is called. At some point, `cancel` function is called perhaps from another thread. + * Then `readSuffix` function is called and then destructor. + * 3. At any time, the object can be destroyed (destructor called). + */ + + Block readImpl() override + { + if (all_read) + return {}; + + Block ret; + while (!cur_stream || !(ret = cur_stream->read())) + { + if (cur_stream) + cur_stream->readSuffix(); // release old inputstream + cur_stream = shared_pool->pickOne(); + if (!cur_stream) + { // shared_pool is empty + all_read = true; + return {}; + } + cur_stream->readPrefix(); + } + return ret; + } + + /// Called either after everything is read, or after cancel. + void readSuffix() override + { + if (!all_read && !is_cancelled) + throw Exception("readSuffix called before all data is read", ErrorCodes::LOGICAL_ERROR); + + if (cur_stream) + { + cur_stream->readSuffix(); + cur_stream = nullptr; + } + } + +private: + LoggerPtr log; + + std::shared_ptr shared_pool; + std::shared_ptr cur_stream; + + bool all_read = false; +}; + +} // namespace DB diff --git a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp index 1a59b979c29..cd9d6235f52 100644 --- a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp +++ b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp @@ -20,18 +20,11 @@ #include #include - -namespace ProfileEvents -{ -extern const Event ExternalAggregationMerge; -} - - namespace DB { ParallelAggregatingBlockInputStream::ParallelAggregatingBlockInputStream( const BlockInputStreams & inputs, - const BlockInputStreamPtr & additional_input_at_end, + const BlockInputStreams & additional_inputs_at_end, const Aggregator::Params & params_, const FileProviderPtr & file_provider_, bool final_, @@ -48,11 +41,10 @@ ParallelAggregatingBlockInputStream::ParallelAggregatingBlockInputStream( , keys_size(params.keys_size) , aggregates_size(params.aggregates_size) , handler(*this) - , processor(inputs, additional_input_at_end, max_threads, handler, log) + , processor(inputs, additional_inputs_at_end, max_threads, handler, log) { children = inputs; - if (additional_input_at_end) - children.push_back(additional_input_at_end); + children.insert(children.end(), additional_inputs_at_end.begin(), additional_inputs_at_end.end()); } @@ -101,8 +93,6 @@ Block ParallelAggregatingBlockInputStream::readImpl() * then read and merge them, spending the minimum amount of memory. 
*/ - ProfileEvents::increment(ProfileEvents::ExternalAggregationMerge); - const auto & files = aggregator.getTemporaryFiles(); BlockInputStreams input_streams; for (const auto & file : files.files) @@ -207,8 +197,8 @@ void ParallelAggregatingBlockInputStream::Handler::onException(std::exception_pt /// can not cancel parent inputStream or the exception might be lost if (!parent.executed) - /// kill the processor so ExchangeReceiver will be closed - parent.processor.cancel(true); + /// use cancel instead of kill to avoid too many useless error message + parent.processor.cancel(false); } diff --git a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h index 41e61786370..907622c8364 100644 --- a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h +++ b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h @@ -36,7 +36,7 @@ class ParallelAggregatingBlockInputStream : public IProfilingBlockInputStream */ ParallelAggregatingBlockInputStream( const BlockInputStreams & inputs, - const BlockInputStreamPtr & additional_input_at_end, + const BlockInputStreams & additional_inputs_at_end, const Aggregator::Params & params_, const FileProviderPtr & file_provider_, bool final_, diff --git a/dbms/src/DataStreams/ParallelInputsProcessor.h b/dbms/src/DataStreams/ParallelInputsProcessor.h index 0e839093cd7..57ab37e1756 100644 --- a/dbms/src/DataStreams/ParallelInputsProcessor.h +++ b/dbms/src/DataStreams/ParallelInputsProcessor.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -46,11 +47,6 @@ * then read block from source and then put source back to queue of available sources. */ -namespace CurrentMetrics -{ -extern const Metric QueryThread; -} - namespace DB { /** Union mode. @@ -88,9 +84,8 @@ template class ParallelInputsProcessor { public: - /** additional_input_at_end - if not nullptr, - * then the blocks from this source will start to be processed only after all other sources are processed. - * This is done in the main thread. + /** additional_inputs_at_end - if not empty, + * then the blocks from the sources will start to be processed only after all other sources are processed. * * Intended for implementation of FULL and RIGHT JOIN * - where you must first make JOIN in parallel, while noting which keys are not found, @@ -98,19 +93,18 @@ class ParallelInputsProcessor */ ParallelInputsProcessor( const BlockInputStreams & inputs_, - const BlockInputStreamPtr & additional_input_at_end_, + const BlockInputStreams & additional_inputs_at_end_, size_t max_threads_, Handler & handler_, const LoggerPtr & log_) : inputs(inputs_) - , additional_input_at_end(additional_input_at_end_) - , max_threads(std::min(inputs_.size(), max_threads_)) + , additional_inputs_at_end(additional_inputs_at_end_) + , max_threads(std::min(std::max(inputs_.size(), additional_inputs_at_end_.size()), max_threads_)) , handler(handler_) + , working_inputs(inputs_) + , working_additional_inputs(additional_inputs_at_end_) , log(log_) - { - for (size_t i = 0; i < inputs_.size(); ++i) - unprepared_inputs.emplace(inputs_[i], i); - } + {} ~ParallelInputsProcessor() { @@ -137,36 +131,21 @@ class ParallelInputsProcessor /// Ask all sources to stop earlier than they run out. 
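/// Cancelling the two MPMCQueues first wakes up any worker blocked in pop(), so the worker /// threads observe the cancellation promptly before the streams themselves are cancelled.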
void cancel(bool kill) { - finish = true; + working_inputs.available_inputs.cancel(); + working_additional_inputs.available_inputs.cancel(); - for (auto & input : inputs) - { - if (IProfilingBlockInputStream * child = dynamic_cast(&*input)) - { - try - { - child->cancel(kill); - } - catch (...) - { - /** If you can not ask one or more sources to stop. - * (for example, the connection is broken for distributed query processing) - * - then do not care. - */ - LOG_FMT_ERROR(log, "Exception while cancelling {}", child->getName()); - } - } - } + cancelStreams(inputs, kill); + cancelStreams(additional_inputs_at_end, kill); } /// Wait until all threads are finished, before the destructor. void wait() { - if (joined_threads) - return; if (thread_manager) + { thread_manager->wait(); - joined_threads = true; + thread_manager.reset(); + } } size_t getNumActiveThreads() const @@ -186,13 +165,78 @@ class ParallelInputsProcessor BlockInputStreamPtr in; size_t i; /// The source number (for debugging). - InputData() {} + InputData() + : i(0) + {} InputData(const BlockInputStreamPtr & in_, size_t i_) : in(in_) , i(i_) {} }; + struct WorkingInputs + { + explicit WorkingInputs(const BlockInputStreams & inputs_) + : available_inputs(inputs_.size()) + , active_inputs(inputs_.size()) + , unprepared_inputs(inputs_.size()) + { + for (size_t i = 0; i < inputs_.size(); ++i) + unprepared_inputs.emplace(inputs_[i], i); + } + /** A set of available sources that are not currently processed by any thread. + * Each thread takes one source from this set, takes a block out of the source (at this moment the source does the calculations) + * and (if the source is not run out), puts it back into the set of available sources. + * + * The question arises what is better to use: + * - the queue (just processed source will be processed the next time later than the rest) + * - stack (just processed source will be processed as soon as possible). + * + * The stack is better than the queue when you need to do work on reading one source more consequentially, + * and theoretically, this allows you to achieve more consequent/consistent reads from the disk. + * + * But when using the stack, there is a problem with distributed query processing: + * data is read only from a part of the servers, and on the other servers + * a timeout occurs during send, and the request processing ends with an exception. + * + * Therefore, a queue is used. This can be improved in the future. + */ + using AvailableInputs = MPMCQueue; + AvailableInputs available_inputs; + + /// How many active input streams. + std::atomic active_inputs; + + /** For parallel preparing (readPrefix) child streams. + * First, streams are located here. + * After a stream was prepared, it is moved to "available_inputs" for reading. + */ + using UnpreparedInputs = MPMCQueue; + UnpreparedInputs unprepared_inputs; + }; + + void cancelStreams(const BlockInputStreams & streams, bool kill) + { + for (const auto & input : streams) + { + if (auto * p_child = dynamic_cast(&*input)) + { + try + { + p_child->cancel(kill); + } + catch (...) + { + /** If you can not ask one or more sources to stop. + * (for example, the connection is broken for distributed query processing) + * - then do not care. 
+ */ + LOG_FMT_ERROR(log, "Exception while cancelling {}", p_child->getName()); + } + } + } + } + void publishPayload(BlockInputStreamPtr & stream, Block & block, size_t thread_num) { if constexpr (mode == StreamUnionMode::Basic) @@ -206,34 +250,24 @@ class ParallelInputsProcessor void thread(size_t thread_num) { - std::exception_ptr exception; + work(thread_num, working_inputs); + work(thread_num, working_additional_inputs); - CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread}; + handler.onFinishThread(thread_num); - try + if (0 == --active_threads) { - while (!finish) - { - InputData unprepared_input; - { - std::lock_guard lock(unprepared_inputs_mutex); - - if (unprepared_inputs.empty()) - break; - - unprepared_input = unprepared_inputs.front(); - unprepared_inputs.pop(); - } - - unprepared_input.in->readPrefix(); + handler.onFinish(); + } + } - { - std::lock_guard lock(available_inputs_mutex); - available_inputs.push(unprepared_input); - } - } + void work(size_t thread_num, WorkingInputs & work) + { + std::exception_ptr exception; - loop(thread_num); + try + { + loop(thread_num, work); } catch (...) { @@ -244,134 +278,63 @@ class ParallelInputsProcessor { handler.onException(exception, thread_num); } - - handler.onFinishThread(thread_num); - - /// The last thread on the output indicates that there is no more data. - if (0 == --active_threads) - { - /// And then it processes an additional source, if there is one. - if (additional_input_at_end) - { - try - { - additional_input_at_end->readPrefix(); - while (Block block = additional_input_at_end->read()) - publishPayload(additional_input_at_end, block, thread_num); - } - catch (...) - { - exception = std::current_exception(); - } - - if (exception) - { - handler.onException(exception, thread_num); - } - } - - handler.onFinish(); /// TODO If in `onFinish` or `onFinishThread` there is an exception, then std::terminate is called. - } } - void loop(size_t thread_num) + /// This function may be called in different threads. + /// If no exception occurs, we can ensure that the work is all done when the function + /// returns in any thread. + void loop(size_t thread_num, WorkingInputs & work) { - while (!finish) /// You may need to stop work earlier than all sources run out. + if (work.active_inputs == 0) { - InputData input; + return; + } - /// Select the next source. - { - std::lock_guard lock(available_inputs_mutex); + InputData input; - /// If there are no free sources, then this thread is no longer needed. (But other threads can work with their sources.) - if (available_inputs.empty()) - break; - - input = available_inputs.front(); + while (work.unprepared_inputs.tryPop(input)) + { + input.in->readPrefix(); - /// We remove the source from the queue of available sources. - available_inputs.pop(); - } + work.available_inputs.push(input); + } + // The condition is false when all input streams are exhausted or + // an exception occurred then the queue was cancelled. + while (work.available_inputs.pop(input)) + { /// The main work. Block block = input.in->read(); + if (block) { - if (finish) - break; - - /// If this source is not run out yet, then put the resulting block in the ready queue. 
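+ /// The stream still yields blocks: return it to the queue before publishing, so + /// another worker can continue reading from it in the meantime.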
+ work.available_inputs.push(input); + publishPayload(input.in, block, thread_num); + } + else + { + if (0 == --work.active_inputs) { - std::lock_guard lock(available_inputs_mutex); - - if (block) - { - available_inputs.push(input); - } - else - { - if (available_inputs.empty()) - break; - } - } - - if (finish) + work.available_inputs.finish(); break; - - if (block) - publishPayload(input.in, block, thread_num); + } } } } - BlockInputStreams inputs; - BlockInputStreamPtr additional_input_at_end; + const BlockInputStreams inputs; + const BlockInputStreams additional_inputs_at_end; unsigned max_threads; Handler & handler; std::shared_ptr thread_manager; - /** A set of available sources that are not currently processed by any thread. - * Each thread takes one source from this set, takes a block out of the source (at this moment the source does the calculations) - * and (if the source is not run out), puts it back into the set of available sources. - * - * The question arises what is better to use: - * - the queue (just processed source will be processed the next time later than the rest) - * - stack (just processed source will be processed as soon as possible). - * - * The stack is better than the queue when you need to do work on reading one source more consequentially, - * and theoretically, this allows you to achieve more consequent/consistent reads from the disk. - * - * But when using the stack, there is a problem with distributed query processing: - * data is read only from a part of the servers, and on the other servers - * a timeout occurs during send, and the request processing ends with an exception. - * - * Therefore, a queue is used. This can be improved in the future. - */ - using AvailableInputs = std::queue; - AvailableInputs available_inputs; - - /** For parallel preparing (readPrefix) child streams. - * First, streams are located here. - * After a stream was prepared, it is moved to "available_inputs" for reading. - */ - using UnpreparedInputs = std::queue; - UnpreparedInputs unprepared_inputs; - - /// For operations with available_inputs. - std::mutex available_inputs_mutex; - - /// For operations with unprepared_inputs. - std::mutex unprepared_inputs_mutex; + WorkingInputs working_inputs; + WorkingInputs working_additional_inputs; /// How many sources ran out. std::atomic active_threads{0}; - /// Finish the threads work (before the sources run out). - std::atomic finish{false}; - /// Wait for the completion of all threads. - std::atomic joined_threads{false}; const LoggerPtr log; }; diff --git a/dbms/src/DataStreams/SharedQueryBlockInputStream.h b/dbms/src/DataStreams/SharedQueryBlockInputStream.h index e7cece67f0b..d7c0707b5aa 100644 --- a/dbms/src/DataStreams/SharedQueryBlockInputStream.h +++ b/dbms/src/DataStreams/SharedQueryBlockInputStream.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,11 @@ namespace DB { +namespace FailPoints +{ +extern const char random_sharedquery_failpoint[]; +} // namespace FailPoints + /** This block input stream is used by SharedQuery. * It enable multiple threads read from one stream. 
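* A random failpoint (random_sharedquery_failpoint) is triggered inside the fetch loop * so tests can exercise the error-handling path.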
*/ @@ -136,6 +142,7 @@ class SharedQueryBlockInputStream : public IProfilingBlockInputStream in->readPrefix(); while (true) { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_sharedquery_failpoint); Block block = in->read(); // in is finished or queue is canceled if (!block || !queue.push(block)) diff --git a/dbms/src/DataStreams/SizeLimits.cpp b/dbms/src/DataStreams/SizeLimits.cpp index 7dd5e1524ba..4d1bfaae997 100644 --- a/dbms/src/DataStreams/SizeLimits.cpp +++ b/dbms/src/DataStreams/SizeLimits.cpp @@ -12,22 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include #include -#include +#include +#include +#include +#include namespace DB { +namespace FailPoints +{ +extern const char random_limit_check_failpoint[]; +} // namespace FailPoints bool SizeLimits::check(UInt64 rows, UInt64 bytes, const char * what, int exception_code) const { - if (max_rows && rows > max_rows) + bool rows_exceed_limit = max_rows && rows > max_rows; + fiu_do_on(FailPoints::random_limit_check_failpoint, rows_exceed_limit = true;); + if (rows_exceed_limit) { if (overflow_mode == OverflowMode::THROW) throw Exception("Limit for " + std::string(what) + " exceeded, max rows: " + formatReadableQuantity(max_rows) - + ", current rows: " + formatReadableQuantity(rows), exception_code); + + ", current rows: " + formatReadableQuantity(rows), + exception_code); else return false; } @@ -36,7 +44,8 @@ { if (overflow_mode == OverflowMode::THROW) throw Exception("Limit for " + std::string(what) + " exceeded, max bytes: " + formatReadableSizeWithBinarySuffix(max_bytes) - + ", current bytes: " + formatReadableSizeWithBinarySuffix(bytes), exception_code); + + ", current bytes: " + formatReadableSizeWithBinarySuffix(bytes), + exception_code); else return false; } @@ -44,4 +53,4 @@ bool SizeLimits::check(UInt64 rows, UInt64 bytes, const char * what, int excepti return true; } -} +} // namespace DB diff --git a/dbms/src/DataStreams/TiRemoteBlockInputStream.h b/dbms/src/DataStreams/TiRemoteBlockInputStream.h index 76fda0b57d0..c1afb1e9f4e 100644 --- a/dbms/src/DataStreams/TiRemoteBlockInputStream.h +++ b/dbms/src/DataStreams/TiRemoteBlockInputStream.h @@ -59,6 +59,11 @@ class TiRemoteBlockInputStream : public IProfilingBlockInputStream uint64_t total_rows; + // For fine grained shuffle, the sender will partition data into multiple streams by hashing. + // ExchangeReceiverBlockInputStream only needs to read its own stream, i.e., streams[stream_id]. + // CoprocessorBlockInputStream doesn't take care of this.
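+ // E.g. with fine_grained_shuffle_stream_count = 4, four receiver streams are created, + // and the one constructed with stream_id = 2 consumes only partition 2 of each sender.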
+ size_t stream_id; + void initRemoteExecutionSummaries(tipb::SelectResponse & resp, size_t index) { for (const auto & execution_summary : resp.execution_summaries()) @@ -121,7 +126,7 @@ class TiRemoteBlockInputStream : public IProfilingBlockInputStream bool fetchRemoteResult() { - auto result = remote_reader->nextResult(block_queue, sample_block); + auto result = remote_reader->nextResult(block_queue, sample_block, stream_id); if (result.meet_error) { LOG_FMT_WARNING(log, "remote reader meets error: {}", result.error_msg); @@ -169,13 +174,14 @@ class TiRemoteBlockInputStream : public IProfilingBlockInputStream } public: - TiRemoteBlockInputStream(std::shared_ptr remote_reader_, const String & req_id, const String & executor_id) + TiRemoteBlockInputStream(std::shared_ptr remote_reader_, const String & req_id, const String & executor_id, size_t stream_id_) : remote_reader(remote_reader_) , source_num(remote_reader->getSourceNum()) , name(fmt::format("TiRemoteBlockInputStream({})", RemoteReader::name)) , execution_summaries_inited(source_num) , log(Logger::get(name, req_id, executor_id)) , total_rows(0) + , stream_id(stream_id_) { for (size_t i = 0; i < source_num; ++i) { diff --git a/dbms/src/DataStreams/UnionBlockInputStream.h b/dbms/src/DataStreams/UnionBlockInputStream.h index 251d0663e14..ffcc8d77c10 100644 --- a/dbms/src/DataStreams/UnionBlockInputStream.h +++ b/dbms/src/DataStreams/UnionBlockInputStream.h @@ -94,20 +94,19 @@ class UnionBlockInputStream final : public IProfilingBlockInputStream public: UnionBlockInputStream( BlockInputStreams inputs, - BlockInputStreamPtr additional_input_at_end, + BlockInputStreams additional_inputs_at_end, size_t max_threads, const String & req_id, ExceptionCallback exception_callback_ = ExceptionCallback()) - : output_queue(std::min(inputs.size(), max_threads) * 5) // reduce contention + : output_queue(std::min(std::max(inputs.size(), additional_inputs_at_end.size()), max_threads) * 5) // reduce contention , log(Logger::get(NAME, req_id)) , handler(*this) - , processor(inputs, additional_input_at_end, max_threads, handler, log) + , processor(inputs, additional_inputs_at_end, max_threads, handler, log) , exception_callback(exception_callback_) { // TODO: assert capacity of output_queue is not less than processor.getMaxThreads() children = inputs; - if (additional_input_at_end) - children.push_back(additional_input_at_end); + children.insert(children.end(), additional_inputs_at_end.begin(), additional_inputs_at_end.end()); size_t num_children = children.size(); if (num_children > 1) @@ -293,8 +292,8 @@ class UnionBlockInputStream final : public IProfilingBlockInputStream /// and the exception is lost. 
output_queue.emplace(exception); /// can not cancel itself or the exception might be lost - /// kill the processor so ExchangeReceiver will be closed - processor.cancel(true); + /// use cancel instead of kill to avoid too many useless error message + processor.cancel(false); } struct Handler diff --git a/dbms/src/DataStreams/WindowBlockInputStream.cpp b/dbms/src/DataStreams/WindowBlockInputStream.cpp index bc63db52873..2cc61df8104 100644 --- a/dbms/src/DataStreams/WindowBlockInputStream.cpp +++ b/dbms/src/DataStreams/WindowBlockInputStream.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include namespace DB @@ -574,4 +575,21 @@ void WindowBlockInputStream::tryCalculate() peer_group_number = 1; } } + +void WindowBlockInputStream::appendInfo(FmtBuffer & buffer) const +{ + buffer.append(", function: {"); + buffer.joinStr( + window_description.window_functions_descriptions.begin(), + window_description.window_functions_descriptions.end(), + [&](const auto & func, FmtBuffer & b) { + b.append(func.window_function->getName()); + }, + ", "); + buffer.fmtAppend( + "}}, frame: {{type: {}, boundary_begin: {}, boundary_end: {}}}", + frameTypeToString(window_description.frame.type), + boundaryTypeToString(window_description.frame.begin_type), + boundaryTypeToString(window_description.frame.end_type)); +} } // namespace DB diff --git a/dbms/src/DataStreams/WindowBlockInputStream.h b/dbms/src/DataStreams/WindowBlockInputStream.h index 46b18dec1ee..0ef23aa9f6f 100644 --- a/dbms/src/DataStreams/WindowBlockInputStream.h +++ b/dbms/src/DataStreams/WindowBlockInputStream.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -169,6 +170,7 @@ class WindowBlockInputStream : public IProfilingBlockInputStream protected: Block readImpl() override; + void appendInfo(FmtBuffer & buffer) const override; LoggerPtr log; diff --git a/dbms/src/DataStreams/tests/union_stream2.cpp b/dbms/src/DataStreams/tests/union_stream2.cpp index f939cda4e14..fb3f7238414 100644 --- a/dbms/src/DataStreams/tests/union_stream2.cpp +++ b/dbms/src/DataStreams/tests/union_stream2.cpp @@ -51,7 +51,7 @@ try for (size_t i = 0, size = streams.size(); i < size; ++i) streams[i] = std::make_shared(streams[i]); - BlockInputStreamPtr stream = std::make_shared>(streams, nullptr, settings.max_threads, /*req_id=*/""); + BlockInputStreamPtr stream = std::make_shared>(streams, BlockInputStreams{}, settings.max_threads, /*req_id=*/""); stream = std::make_shared(stream, 10, 0, ""); WriteBufferFromFileDescriptor wb(STDERR_FILENO); diff --git a/dbms/src/DataTypes/IDataType.h b/dbms/src/DataTypes/IDataType.h index 120d0b1ba30..71fda0615e4 100644 --- a/dbms/src/DataTypes/IDataType.h +++ b/dbms/src/DataTypes/IDataType.h @@ -471,7 +471,6 @@ class IDataType : private boost::noncopyable virtual bool isEnum() const { return false; }; virtual bool isNullable() const { return false; } - /** Is this type can represent only NULL value? (It also implies isNullable) */ virtual bool onlyNull() const { return false; } diff --git a/dbms/src/DataTypes/NumberTraits.h b/dbms/src/DataTypes/NumberTraits.h index 925628a8894..a8b91b88075 100644 --- a/dbms/src/DataTypes/NumberTraits.h +++ b/dbms/src/DataTypes/NumberTraits.h @@ -277,6 +277,7 @@ struct ResultOfAbs> }; /** For bitwise operations, an integer is obtained with number of bits is equal to the maximum of the arguments. + * todo: note that MySQL handles only unsigned 64-bit integer argument and result values. We should refine the code. 
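+ * For example, a bitwise op between UInt32 and UInt64 arguments yields UInt64 here, + * while MySQL would treat both arguments and the result as unsigned 64-bit.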
*/ template struct ResultOfBit diff --git a/dbms/src/Debug/DBGInvoker.cpp b/dbms/src/Debug/DBGInvoker.cpp index 3f633c08e67..df993d8e6e9 100644 --- a/dbms/src/Debug/DBGInvoker.cpp +++ b/dbms/src/Debug/DBGInvoker.cpp @@ -118,6 +118,10 @@ DBGInvoker::DBGInvoker() regSchemalessFunc("mapped_database", dbgFuncMappedDatabase); regSchemalessFunc("mapped_table", dbgFuncMappedTable); regSchemafulFunc("query_mapped", dbgFuncQueryMapped); + regSchemalessFunc("get_tiflash_replica_count", dbgFuncGetTiflashReplicaCount); + regSchemalessFunc("get_partition_tables_tiflash_replica_count", dbgFuncGetPartitionTablesTiflashReplicaCount); + regSchemalessFunc("get_tiflash_mode", dbgFuncGetTiflashMode); + regSchemalessFunc("get_partition_tables_tiflash_mode", dbgFuncGetPartitionTablesTiflashMode); regSchemalessFunc("search_log_for_key", dbgFuncSearchLogForKey); regSchemalessFunc("tidb_dag", dbgFuncTiDBQueryFromNaturalDag); diff --git a/dbms/src/Debug/MockSchemaGetter.h b/dbms/src/Debug/MockSchemaGetter.h index f02699866ce..11c5d97f036 100644 --- a/dbms/src/Debug/MockSchemaGetter.h +++ b/dbms/src/Debug/MockSchemaGetter.h @@ -17,16 +17,25 @@ #include #include +#include + namespace DB { - struct MockSchemaGetter { TiDB::DBInfoPtr getDatabase(DatabaseID db_id) { return MockTiDB::instance().getDBInfoByID(db_id); } Int64 getVersion() { return MockTiDB::instance().getVersion(); } - SchemaDiff getSchemaDiff(Int64 version) { return MockTiDB::instance().getSchemaDiff(version); } + std::optional getSchemaDiff(Int64 version) + { + return MockTiDB::instance().getSchemaDiff(version); + } + + bool checkSchemaDiffExists(Int64 version) + { + return MockTiDB::instance().checkSchemaDiffExists(version); + } TiDB::TableInfoPtr getTableInfo(DatabaseID, TableID table_id) { return MockTiDB::instance().getTableInfoByID(table_id); } diff --git a/dbms/src/Debug/MockTiDB.cpp b/dbms/src/Debug/MockTiDB.cpp index 42ab56a97c1..99d9625461b 100644 --- a/dbms/src/Debug/MockTiDB.cpp +++ b/dbms/src/Debug/MockTiDB.cpp @@ -221,7 +221,6 @@ TiDB::TableInfoPtr MockTiDB::parseColumns( { String & name = string_tokens[index]; index_info.idx_cols[index].name = name; - index_info.idx_cols[index].offset = pk_column_pos_map[name]; index_info.idx_cols[index].length = -1; } } @@ -302,7 +301,7 @@ int MockTiDB::newTables( tables_by_id.emplace(table->table_info.id, table); tables_by_name.emplace(qualified_name, table); - AffectedOption opt; + AffectedOption opt{}; opt.schema_id = table->database_id; opt.table_id = table->id(); opt.old_schema_id = table->database_id; @@ -571,7 +570,7 @@ void MockTiDB::renameTables(const std::vectordatabase_id; opt.table_id = new_table->id(); opt.old_schema_id = table->database_id; @@ -669,9 +668,14 @@ std::pair MockTiDB::getDBIDByName(const String & database_name return std::make_pair(false, -1); } -SchemaDiff MockTiDB::getSchemaDiff(Int64 version_) +std::optional MockTiDB::getSchemaDiff(Int64 version_) { return version_diff[version_]; } +bool MockTiDB::checkSchemaDiffExists(Int64 version) +{ + return version_diff.find(version) != version_diff.end(); +} + } // namespace DB diff --git a/dbms/src/Debug/MockTiDB.h b/dbms/src/Debug/MockTiDB.h index 36d2af90859..261e547b13a 100644 --- a/dbms/src/Debug/MockTiDB.h +++ b/dbms/src/Debug/MockTiDB.h @@ -127,7 +127,9 @@ class MockTiDB : public ext::Singleton std::pair getDBIDByName(const String & database_name); - SchemaDiff getSchemaDiff(Int64 version); + bool checkSchemaDiffExists(Int64 version); + + std::optional getSchemaDiff(Int64 version); std::unordered_map getDatabases() { return 
databases; } diff --git a/dbms/src/Debug/astToExecutor.cpp b/dbms/src/Debug/astToExecutor.cpp index a1e9295b3f5..61f4474f919 100644 --- a/dbms/src/Debug/astToExecutor.cpp +++ b/dbms/src/Debug/astToExecutor.cpp @@ -24,13 +24,13 @@ #include #include #include -#include #include #include #include namespace DB { +using ASTPartitionByElement = ASTOrderByElement; void literalFieldToTiPBExpr(const ColumnInfo & ci, const Field & val_field, tipb::Expr * expr, Int32 collator_id) { *(expr->mutable_field_type()) = columnInfoToFieldType(ci); @@ -191,6 +191,12 @@ std::unordered_map agg_func_name_to_sig({ {"group_concat", tipb::ExprType::GroupConcat}, }); +std::unordered_map window_func_name_to_sig({ + {"RowNumber", tipb::ExprType::RowNumber}, + {"Rank", tipb::ExprType::Rank}, + {"DenseRank", tipb::ExprType::DenseRank}, +}); + DAGColumnInfo toNullableDAGColumnInfo(const DAGColumnInfo & input) { DAGColumnInfo output = input; @@ -854,6 +860,7 @@ bool ExchangeReceiver::toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t c { tipb_executor->set_tp(tipb::ExecType::TypeExchangeReceiver); tipb_executor->set_executor_id(name); + tipb_executor->set_fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count); tipb::ExchangeReceiver * exchange_receiver = tipb_executor->mutable_exchange_receiver(); for (auto & field : output_schema) { @@ -1352,6 +1359,107 @@ void Join::toMPPSubPlan(size_t & executor_index, const DAGProperties & propertie exchange_map[left_exchange_receiver->name] = std::make_pair(left_exchange_receiver, left_exchange_sender); exchange_map[right_exchange_receiver->name] = std::make_pair(right_exchange_receiver, right_exchange_sender); } + +bool Window::toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t collator_id, const MPPInfo & mpp_info, const Context & context) +{ + tipb_executor->set_tp(tipb::ExecType::TypeWindow); + tipb_executor->set_executor_id(name); + tipb_executor->set_fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count); + tipb::Window * window = tipb_executor->mutable_window(); + auto & input_schema = children[0]->output_schema; + for (const auto & expr : func_descs) + { + tipb::Expr * window_expr = window->add_func_desc(); + const auto * window_func = typeid_cast(expr.get()); + for (const auto & arg : window_func->arguments->children) + { + tipb::Expr * func = window_expr->add_children(); + astToPB(input_schema, arg, func, collator_id, context); + } + auto window_sig_it = window_func_name_to_sig.find(window_func->name); + if (window_sig_it == window_func_name_to_sig.end()) + throw Exception(fmt::format("Unsupported window function {}", window_func->name), ErrorCodes::LOGICAL_ERROR); + auto window_sig = window_sig_it->second; + window_expr->set_tp(window_sig); + auto * ft = window_expr->mutable_field_type(); + // TODO: Maybe more window functions with different field type. 
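+ // All window functions supported so far (RowNumber, Rank, DenseRank) return a + // 64-bit integer, so one fixed field type suffices here.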
+ ft->set_tp(TiDB::TypeLongLong); + ft->set_flag(TiDB::ColumnFlagBinary); + ft->set_collate(collator_id); + ft->set_flen(21); + ft->set_decimal(-1); + } + + for (const auto & child : order_by_exprs) + { + auto * elem = typeid_cast(child.get()); + if (!elem) + throw Exception("Invalid order by element", ErrorCodes::LOGICAL_ERROR); + tipb::ByItem * by = window->add_order_by(); + by->set_desc(elem->direction < 0); + tipb::Expr * expr = by->mutable_expr(); + astToPB(children[0]->output_schema, elem->children[0], expr, collator_id, context); + } + + for (const auto & child : partition_by_exprs) + { + auto * elem = typeid_cast(child.get()); + if (!elem) + throw Exception("Invalid partition by element", ErrorCodes::LOGICAL_ERROR); + tipb::ByItem * by = window->add_partition_by(); + by->set_desc(elem->direction < 0); + tipb::Expr * expr = by->mutable_expr(); + astToPB(children[0]->output_schema, elem->children[0], expr, collator_id, context); + } + + if (frame.type.has_value()) + { + tipb::WindowFrame * mut_frame = window->mutable_frame(); + mut_frame->set_type(frame.type.value()); + if (frame.start.has_value()) + { + auto * start = mut_frame->mutable_start(); + start->set_offset(std::get<2>(frame.start.value())); + start->set_unbounded(std::get<1>(frame.start.value())); + start->set_type(std::get<0>(frame.start.value())); + } + + if (frame.end.has_value()) + { + auto * end = mut_frame->mutable_end(); + end->set_offset(std::get<2>(frame.end.value())); + end->set_unbounded(std::get<1>(frame.end.value())); + end->set_type(std::get<0>(frame.end.value())); + } + } + + auto * children_executor = window->mutable_child(); + return children[0]->toTiPBExecutor(children_executor, collator_id, mpp_info, context); +} + +bool Sort::toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t collator_id, const MPPInfo & mpp_info, const Context & context) +{ + tipb_executor->set_tp(tipb::ExecType::TypeSort); + tipb_executor->set_executor_id(name); + tipb_executor->set_fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count); + tipb::Sort * sort = tipb_executor->mutable_sort(); + sort->set_ispartialsort(is_partial_sort); + + for (const auto & child : by_exprs) + { + auto * elem = typeid_cast(child.get()); + if (!elem) + throw Exception("Invalid order by element", ErrorCodes::LOGICAL_ERROR); + tipb::ByItem * by = sort->add_byitems(); + by->set_desc(elem->direction < 0); + tipb::Expr * expr = by->mutable_expr(); + astToPB(children[0]->output_schema, elem->children[0], expr, collator_id, context); + } + + auto * children_executor = sort->mutable_child(); + return children[0]->toTiPBExecutor(children_executor, collator_id, mpp_info, context); +} + } // namespace mock ExecutorPtr compileTableScan(size_t & executor_index, TableInfo & table_info, String & table_alias, bool append_pk_column) @@ -1449,7 +1557,7 @@ ExecutorPtr compileAggregation(ExecutorPtr input, size_t & executor_index, ASTPt ci.tp = TiDB::TypeLongLong; ci.flag = TiDB::ColumnFlagUnsigned | TiDB::ColumnFlagNotNull; } - else if (func->name == "max" || func->name == "min" || func->name == "first_row") + else if (func->name == "max" || func->name == "min" || func->name == "first_row" || func->name == "sum") { ci = children_ci[0]; ci.flag &= ~TiDB::ColumnFlagNotNull; @@ -1533,7 +1641,6 @@ ExecutorPtr compileProject(ExecutorPtr input, size_t & executor_index, ASTPtr se } } } - auto project = std::make_shared(executor_index, output_schema, std::move(exprs)); project->children.push_back(input); return project; @@ -1570,11 +1677,102 @@ ExecutorPtr 
compileExchangeSender(ExecutorPtr input, size_t & executor_index, ti return exchange_sender; } - -ExecutorPtr compileExchangeReceiver(size_t & executor_index, DAGSchema schema) +ExecutorPtr compileExchangeReceiver(size_t & executor_index, DAGSchema schema, uint64_t fine_grained_shuffle_stream_count) { - ExecutorPtr exchange_receiver = std::make_shared(executor_index, schema); + ExecutorPtr exchange_receiver = std::make_shared(executor_index, schema, fine_grained_shuffle_stream_count); return exchange_receiver; } -} // namespace DB \ No newline at end of file +ExecutorPtr compileWindow(ExecutorPtr input, size_t & executor_index, ASTPtr func_desc_list, ASTPtr partition_by_expr_list, ASTPtr order_by_expr_list, mock::MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count) +{ + std::vector partition_columns; + if (partition_by_expr_list != nullptr) + { + for (const auto & child : partition_by_expr_list->children) + { + auto * elem = typeid_cast(child.get()); + if (!elem) + throw Exception("Invalid partition by element", ErrorCodes::LOGICAL_ERROR); + partition_columns.push_back(child); + compileExpr(input->output_schema, elem->children[0]); + } + } + + std::vector order_columns; + if (order_by_expr_list != nullptr) + { + for (const auto & child : order_by_expr_list->children) + { + auto * elem = typeid_cast(child.get()); + if (!elem) + throw Exception("Invalid order by element", ErrorCodes::LOGICAL_ERROR); + order_columns.push_back(child); + compileExpr(input->output_schema, elem->children[0]); + } + } + + DAGSchema output_schema; + output_schema.insert(output_schema.end(), input->output_schema.begin(), input->output_schema.end()); + + std::vector window_exprs; + if (func_desc_list != nullptr) + { + for (const auto & expr : func_desc_list->children) + { + const auto * func = typeid_cast(expr.get()); + window_exprs.push_back(expr); + std::vector children_ci; + for (const auto & arg : func->arguments->children) + { + children_ci.push_back(compileExpr(input->output_schema, arg)); + } + // TODO: add more window functions + TiDB::ColumnInfo ci; + switch (window_func_name_to_sig[func->name]) + { + case tipb::ExprType::RowNumber: + case tipb::ExprType::Rank: + case tipb::ExprType::DenseRank: + { + ci.tp = TiDB::TypeLongLong; + ci.flag = TiDB::ColumnFlagBinary; + break; + } + default: + throw Exception(fmt::format("Unsupported window function {}", func->name), ErrorCodes::LOGICAL_ERROR); + } + output_schema.emplace_back(std::make_pair(func->getColumnName(), ci)); + } + } + + ExecutorPtr window = std::make_shared( + executor_index, + output_schema, + window_exprs, + std::move(partition_columns), + std::move(order_columns), + frame, + fine_grained_shuffle_stream_count); + window->children.push_back(input); + return window; +} + +ExecutorPtr compileSort(ExecutorPtr input, size_t & executor_index, ASTPtr order_by_expr_list, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count) +{ + std::vector order_columns; + if (order_by_expr_list != nullptr) + { + for (const auto & child : order_by_expr_list->children) + { + auto * elem = typeid_cast(child.get()); + if (!elem) + throw Exception("Invalid order by element", ErrorCodes::LOGICAL_ERROR); + order_columns.push_back(child); + compileExpr(input->output_schema, elem->children[0]); + } + } + ExecutorPtr sort = std::make_shared(executor_index, input->output_schema, std::move(order_columns), is_partial_sort, fine_grained_shuffle_stream_count); + sort->children.push_back(input); + return sort; +} +} // namespace DB diff --git 
a/dbms/src/Debug/astToExecutor.h b/dbms/src/Debug/astToExecutor.h index 37d3f22b6e1..f39f4059d26 100644 --- a/dbms/src/Debug/astToExecutor.h +++ b/dbms/src/Debug/astToExecutor.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,8 @@ #include #include +#include + namespace DB { namespace ErrorCodes @@ -136,8 +139,11 @@ struct ExchangeSender : Executor struct ExchangeReceiver : Executor { TaskMetas task_metas; - ExchangeReceiver(size_t & index, const DAGSchema & output) + uint64_t fine_grained_shuffle_stream_count; + + ExchangeReceiver(size_t & index, const DAGSchema & output, uint64_t fine_grained_shuffle_stream_count_ = 0) : Executor(index, "exchange_receiver_" + std::to_string(index), output) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) {} void columnPrune(std::unordered_set &) override { throw Exception("Should not reach here"); } bool toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t collator_id, const MPPInfo & mpp_info, const Context &) override; @@ -272,6 +278,58 @@ struct Join : Executor void toMPPSubPlan(size_t & executor_index, const DAGProperties & properties, std::unordered_map, std::shared_ptr>> & exchange_map) override; }; + +using MockWindowFrameBound = std::tuple; + +struct MockWindowFrame +{ + std::optional type; + std::optional start; + std::optional end; + // TODO: support calcFuncs +}; + +struct Window : Executor +{ + std::vector func_descs; + std::vector partition_by_exprs; + std::vector order_by_exprs; + MockWindowFrame frame; + uint64_t fine_grained_shuffle_stream_count; + + Window(size_t & index_, const DAGSchema & output_schema_, std::vector func_descs_, std::vector partition_by_exprs_, std::vector order_by_exprs_, MockWindowFrame frame_, uint64_t fine_grained_shuffle_stream_count_ = 0) + : Executor(index_, "window_" + std::to_string(index_), output_schema_) + , func_descs(std::move(func_descs_)) + , partition_by_exprs(std::move(partition_by_exprs_)) + , order_by_exprs(order_by_exprs_) + , frame(frame_) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) + { + } + // Currently the Window executor is only used in unit tests, which don't call columnPrune. + // TODO: call columnPrune in unit tests and further benchmark tests to eliminate unnecessary computation. + void columnPrune(std::unordered_set &) override { throw Exception("Should not reach here"); } + bool toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t collator_id, const MPPInfo & mpp_info, const Context & context) override; +}; + +struct Sort : Executor +{ + std::vector by_exprs; + bool is_partial_sort; + uint64_t fine_grained_shuffle_stream_count; + + Sort(size_t & index_, const DAGSchema & output_schema_, std::vector by_exprs_, bool is_partial_sort_, uint64_t fine_grained_shuffle_stream_count_ = 0) + : Executor(index_, "sort_" + std::to_string(index_), output_schema_) + , by_exprs(by_exprs_) + , is_partial_sort(is_partial_sort_) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) + { + } + // Currently the Sort executor is only used in unit tests, which don't call columnPrune. + // TODO: call columnPrune in unit tests and further benchmark tests to eliminate unnecessary computation.
+ void columnPrune(std::unordered_set &) override { throw Exception("Should not reach here"); } + bool toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t collator_id, const MPPInfo & mpp_info, const Context & context) override; +}; } // namespace mock using ExecutorPtr = std::shared_ptr; @@ -292,10 +350,11 @@ ExecutorPtr compileJoin(size_t & executor_index, ExecutorPtr left, ExecutorPtr r ExecutorPtr compileExchangeSender(ExecutorPtr input, size_t & executor_index, tipb::ExchangeType exchange_type); -ExecutorPtr compileExchangeReceiver(size_t & executor_index, DAGSchema schema); +ExecutorPtr compileExchangeReceiver(size_t & executor_index, DAGSchema schema, uint64_t fine_grained_shuffle_stream_count = 0); -void literalFieldToTiPBExpr(const ColumnInfo & ci, const Field & field, tipb::Expr * expr, Int32 collator_id); +ExecutorPtr compileWindow(ExecutorPtr input, size_t & executor_index, ASTPtr func_desc_list, ASTPtr partition_by_expr_list, ASTPtr order_by_expr_list, mock::MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count = 0); -//TODO: add compileWindow +ExecutorPtr compileSort(ExecutorPtr input, size_t & executor_index, ASTPtr order_by_expr_list, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count = 0); -} // namespace DB \ No newline at end of file +void literalFieldToTiPBExpr(const ColumnInfo & ci, const Field & field, tipb::Expr * expr, Int32 collator_id); +} // namespace DB diff --git a/dbms/src/Debug/dbgFuncCoprocessor.cpp b/dbms/src/Debug/dbgFuncCoprocessor.cpp index e9335d1e2bd..62a8b7537f1 100644 --- a/dbms/src/Debug/dbgFuncCoprocessor.cpp +++ b/dbms/src/Debug/dbgFuncCoprocessor.cpp @@ -290,8 +290,9 @@ BlockInputStreamPtr executeQuery(Context & context, RegionID region_id, const DA tipb_exchange_receiver.encoded_task_meta_size(), 10, /*req_id=*/"", - /*executor_id=*/""); - BlockInputStreamPtr ret = std::make_shared(exchange_receiver, /*req_id=*/"", /*executor_id=*/""); + /*executor_id=*/"", + /*fine_grained_shuffle_stream_count=*/0); + BlockInputStreamPtr ret = std::make_shared(exchange_receiver, /*req_id=*/"", /*executor_id=*/"", /*stream_id*/ 0); return ret; } else diff --git a/dbms/src/Debug/dbgFuncMockRaftCommand.cpp b/dbms/src/Debug/dbgFuncMockRaftCommand.cpp index df93ee1c78d..3626041f428 100644 --- a/dbms/src/Debug/dbgFuncMockRaftCommand.cpp +++ b/dbms/src/Debug/dbgFuncMockRaftCommand.cpp @@ -40,7 +40,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar auto & tmt = context.getTMTContext(); auto & kvstore = tmt.getKVStore(); - RegionID region_id = (RegionID)safeGet(typeid_cast(*args[0]).value); + auto region_id = static_cast(safeGet(typeid_cast(*args[0]).value)); const String & database_name = typeid_cast(*args[1]).name; const String & table_name = typeid_cast(*args[2]).name; auto table = MockTiDB::instance().getTableByName(database_name, table_name); @@ -49,7 +49,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar if (4 + handle_column_size * 4 != args.size()) throw Exception("Args not matched, should be: region-id1, database-name, table-name, start1, end1, start2, end2, region-id2", ErrorCodes::BAD_ARGUMENTS); - RegionID region_id2 = (RegionID)safeGet(typeid_cast(*args[args.size() - 1]).value); + auto region_id2 = static_cast(safeGet(typeid_cast(*args[args.size() - 1]).value)); auto table_id = table->id(); TiKVKey start_key1, start_key2, end_key1, end_key2; @@ -59,9 +59,17 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar std::vector 
start_keys2; std::vector end_keys1; std::vector end_keys2; + + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } + for (size_t i = 0; i < handle_column_size; i++) { - auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset]; + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + auto & column_info = table_info.columns[idx]; auto start_field1 = RegionBench::convertField(column_info, typeid_cast(*args[3 + i]).value); TiDB::DatumBumpy start_datum1 = TiDB::DatumBumpy(start_field1, column_info.tp); @@ -88,10 +96,10 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar } else { - HandleID start1 = (HandleID)safeGet(typeid_cast(*args[3]).value); - HandleID end1 = (HandleID)safeGet(typeid_cast(*args[4]).value); - HandleID start2 = (HandleID)safeGet(typeid_cast(*args[5]).value); - HandleID end2 = (HandleID)safeGet(typeid_cast(*args[6]).value); + auto start1 = static_cast(safeGet(typeid_cast(*args[3]).value)); + auto end1 = static_cast(safeGet(typeid_cast(*args[4]).value)); + auto start2 = static_cast(safeGet(typeid_cast(*args[5]).value)); + auto end2 = static_cast(safeGet(typeid_cast(*args[6]).value)); start_key1 = RecordKVFormat::genKey(table_id, start1); start_key2 = RecordKVFormat::genKey(table_id, start2); end_key1 = RecordKVFormat::genKey(table_id, end1); @@ -110,7 +118,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar request.set_cmd_type(raft_cmdpb::AdminCmdType::BatchSplit); raft_cmdpb::BatchSplitResponse * splits = response.mutable_splits(); { - auto region = splits->add_regions(); + auto * region = splits->add_regions(); region->set_id(region_id); region->set_start_key(start_key1); region->set_end_key(end_key1); @@ -118,7 +126,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar *region->mutable_region_epoch() = new_epoch; } { - auto region = splits->add_regions(); + auto * region = splits->add_regions(); region->set_id(region_id2); region->set_start_key(start_key2); region->set_end_key(end_key2); @@ -144,8 +152,8 @@ void MockRaftCommand::dbgFuncPrepareMerge(Context & context, const ASTs & args, throw Exception("Args not matched, should be: source-id1, target-id2", ErrorCodes::BAD_ARGUMENTS); } - RegionID region_id = (RegionID)safeGet(typeid_cast(*args[0]).value); - RegionID target_id = (RegionID)safeGet(typeid_cast(*args[1]).value); + auto region_id = static_cast(safeGet(typeid_cast(*args[0]).value)); + auto target_id = static_cast(safeGet(typeid_cast(*args[1]).value)); auto & tmt = context.getTMTContext(); auto & kvstore = tmt.getKVStore(); @@ -157,7 +165,7 @@ void MockRaftCommand::dbgFuncPrepareMerge(Context & context, const ASTs & args, { request.set_cmd_type(raft_cmdpb::AdminCmdType::PrepareMerge); - auto prepare_merge = request.mutable_prepare_merge(); + auto * prepare_merge = request.mutable_prepare_merge(); { auto min_index = region->appliedIndex(); prepare_merge->set_min_index(min_index); @@ -184,8 +192,8 @@ void MockRaftCommand::dbgFuncCommitMerge(Context & context, const ASTs & args, D throw Exception("Args not matched, should be: source-id1, current-id2", ErrorCodes::BAD_ARGUMENTS); } - RegionID source_id = (RegionID)safeGet(typeid_cast(*args[0]).value); - RegionID current_id = (RegionID)safeGet(typeid_cast(*args[1]).value); + auto source_id = 
static_cast(safeGet(typeid_cast(*args[0]).value)); + auto current_id = static_cast(safeGet(typeid_cast(*args[1]).value)); auto & tmt = context.getTMTContext(); auto & kvstore = tmt.getKVStore(); @@ -196,7 +204,7 @@ void MockRaftCommand::dbgFuncCommitMerge(Context & context, const ASTs & args, D { request.set_cmd_type(raft_cmdpb::AdminCmdType::CommitMerge); - auto commit_merge = request.mutable_commit_merge(); + auto * commit_merge = request.mutable_commit_merge(); { commit_merge->set_commit(source_region->appliedIndex()); *commit_merge->mutable_source() = source_region->getMetaRegion(); @@ -220,7 +228,7 @@ void MockRaftCommand::dbgFuncRollbackMerge(Context & context, const ASTs & args, throw Exception("Args not matched, should be: region-id", ErrorCodes::BAD_ARGUMENTS); } - RegionID region_id = (RegionID)safeGet(typeid_cast(*args[0]).value); + auto region_id = static_cast(safeGet(typeid_cast(*args[0]).value)); auto & tmt = context.getTMTContext(); auto & kvstore = tmt.getKVStore(); @@ -231,7 +239,7 @@ void MockRaftCommand::dbgFuncRollbackMerge(Context & context, const ASTs & args, { request.set_cmd_type(raft_cmdpb::AdminCmdType::RollbackMerge); - auto rollback_merge = request.mutable_rollback_merge(); + auto * rollback_merge = request.mutable_rollback_merge(); { auto merge_state = region->getMergeState(); rollback_merge->set_commit(merge_state.commit()); diff --git a/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp b/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp index 7eecbbdf6f7..b5d3f252d0a 100644 --- a/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp +++ b/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp @@ -60,7 +60,7 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args) { const String & database_name = typeid_cast(*args[0]).name; const String & table_name = typeid_cast(*args[1]).name; - RegionID region_id = static_cast(safeGet(typeid_cast(*args[2]).value)); + auto region_id = static_cast(safeGet(typeid_cast(*args[2]).value)); TableID table_id = RegionBench::getTableID(context, database_name, table_name, ""); MockTiDB::TablePtr table = MockTiDB::instance().getTableByName(database_name, table_name); auto & table_info = table->table_info; @@ -68,10 +68,16 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args) size_t handle_column_size = is_common_handle ? 
table_info.getPrimaryIndexInfo().idx_cols.size() : 1; RegionPtr region; + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } + if (!is_common_handle) { - HandleID start = static_cast(safeGet(typeid_cast(*args[3]).value)); - HandleID end = static_cast(safeGet(typeid_cast(*args[4]).value)); + auto start = static_cast(safeGet(typeid_cast(*args[3]).value)); + auto end = static_cast(safeGet(typeid_cast(*args[4]).value)); region = RegionBench::createRegion(table_id, region_id, start, end); } else { @@ -81,7 +87,8 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args) std::vector end_keys; for (size_t i = 0; i < handle_column_size; i++) { - auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset]; + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + auto & column_info = table_info.columns[idx]; auto start_field = RegionBench::convertField(column_info, typeid_cast(*args[3 + i]).value); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); start_keys.emplace_back(start_datum.field()); @@ -105,8 +112,8 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args) for (auto it = args_begin; it != args_end; it += len) { HandleID handle_id = is_common_handle ? 0 : static_cast(safeGet(typeid_cast(*it[0]).value)); - Timestamp tso = static_cast(safeGet(typeid_cast(*it[1]).value)); - UInt8 del = static_cast(safeGet(typeid_cast(*it[2]).value)); + auto tso = static_cast(safeGet(typeid_cast(*it[1]).value)); + auto del = static_cast(safeGet(typeid_cast(*it[2]).value)); { std::vector fields; @@ -122,9 +129,9 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args) std::vector keys; // handle key for (size_t i = 0; i < table_info.getPrimaryIndexInfo().idx_cols.size(); i++) { - auto & idx_col = table_info.getPrimaryIndexInfo().idx_cols[i]; - auto & column_info = table_info.columns[idx_col.offset]; - auto start_field = RegionBench::convertField(column_info, fields[idx_col.offset]); + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + auto & column_info = table_info.columns[idx]; + auto start_field = RegionBench::convertField(column_info, fields[idx]); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); keys.emplace_back(start_datum.field()); } @@ -168,7 +175,7 @@ void MockRaftCommand::dbgFuncRegionSnapshotWithData(Context & context, const AST // DBGInvoke region_snapshot(region-id, start-key, end-key, database-name, table-name[, partition-id]) void MockRaftCommand::dbgFuncRegionSnapshot(Context & context, const ASTs & args, DBGInvoker::Printer output) { - RegionID region_id = static_cast(safeGet(typeid_cast(*args[0]).value)); + auto region_id = static_cast(safeGet(typeid_cast(*args[0]).value)); bool has_partition_id = false; size_t args_size = args.size(); if (dynamic_cast(args[args_size - 1].get()) != nullptr) @@ -198,9 +205,16 @@ void MockRaftCommand::dbgFuncRegionSnapshot(Context & context, const ASTs & args // Get start key and end key from multiple columns if it is a clustered index.
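+ // The primary-index columns are resolved by name via column_name_columns_index_map + // because idx_cols[i].offset is no longer populated (see MockTiDB::parseColumns).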
std::vector start_keys; std::vector end_keys; + + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } for (size_t i = 0; i < handle_column_size; i++) { - const auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset]; + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + const auto & column_info = table_info.columns[idx]; auto start_field = RegionBench::convertField(column_info, typeid_cast(*args[1 + i]).value); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); start_keys.emplace_back(start_datum.field()); @@ -214,15 +228,15 @@ void MockRaftCommand::dbgFuncRegionSnapshot(Context & context, const ASTs & args } else { - HandleID start = static_cast(safeGet(typeid_cast(*args[1]).value)); - HandleID end = static_cast(safeGet(typeid_cast(*args[2]).value)); + auto start = static_cast(safeGet(typeid_cast(*args[1]).value)); + auto end = static_cast(safeGet(typeid_cast(*args[2]).value)); start_key = RecordKVFormat::genKey(table_id, start); end_key = RecordKVFormat::genKey(table_id, end); } region_info.set_start_key(start_key.toString()); region_info.set_end_key(end_key.toString()); - *region_info.add_peers() = createPeer(1, true); - *region_info.add_peers() = createPeer(2, true); + *region_info.add_peers() = tests::createPeer(1, true); + *region_info.add_peers() = tests::createPeer(2, true); auto peer_id = 1; auto start_decoded_key = RecordKVFormat::decodeTiKVKey(start_key); auto end_decoded_key = RecordKVFormat::decodeTiKVKey(end_key); @@ -432,9 +446,9 @@ void MockRaftCommand::dbgFuncIngestSST(Context & context, const ASTs & args, DBG { const String & database_name = typeid_cast(*args[0]).name; const String & table_name = typeid_cast(*args[1]).name; - RegionID region_id = static_cast(safeGet(typeid_cast(*args[2]).value)); - RegionID start_handle = static_cast(safeGet(typeid_cast(*args[3]).value)); - RegionID end_handle = static_cast(safeGet(typeid_cast(*args[4]).value)); + auto region_id = static_cast(safeGet(typeid_cast(*args[2]).value)); + auto start_handle = static_cast(safeGet(typeid_cast(*args[3]).value)); + auto end_handle = static_cast(safeGet(typeid_cast(*args[4]).value)); MockTiDB::TablePtr table = MockTiDB::instance().getTableByName(database_name, table_name); const auto & table_info = RegionBench::getTableInfo(context, database_name, table_name); @@ -555,7 +569,7 @@ void MockRaftCommand::dbgFuncRegionSnapshotApplyBlock(Context & context, const A throw Exception("Args not matched, should be: region-id", ErrorCodes::BAD_ARGUMENTS); } - RegionID region_id = static_cast(safeGet(typeid_cast(*args.front()).value)); + auto region_id = static_cast(safeGet(typeid_cast(*args.front()).value)); auto [region, block_cache] = GLOBAL_REGION_MAP.popRegionCache("__snap_" + std::to_string(region_id)); auto & tmt = context.getTMTContext(); context.getTMTContext().getKVStore()->checkAndApplySnapshot({region, std::move(block_cache)}, tmt); @@ -577,12 +591,12 @@ void MockRaftCommand::dbgFuncRegionSnapshotPreHandleDTFiles(Context & context, c const String & database_name = typeid_cast(*args[0]).name; const String & table_name = typeid_cast(*args[1]).name; - RegionID region_id = static_cast(safeGet(typeid_cast(*args[2]).value)); - RegionID start_handle = static_cast(safeGet(typeid_cast(*args[3]).value)); - RegionID end_handle = 
static_cast(safeGet(typeid_cast(*args[4]).value)); + auto region_id = static_cast(safeGet(typeid_cast(*args[2]).value)); + auto start_handle = static_cast(safeGet(typeid_cast(*args[3]).value)); + auto end_handle = static_cast(safeGet(typeid_cast(*args[4]).value)); - const String schema_str = safeGet(typeid_cast(*args[5]).value); - String handle_pk_name = safeGet(typeid_cast(*args[6]).value); + const auto schema_str = safeGet(typeid_cast(*args[5]).value); + auto handle_pk_name = safeGet(typeid_cast(*args[6]).value); UInt64 test_fields = 1; if (args.size() > 7) @@ -677,10 +691,10 @@ void MockRaftCommand::dbgFuncRegionSnapshotPreHandleDTFilesWithHandles(Context & const String & database_name = typeid_cast(*args[0]).name; const String & table_name = typeid_cast(*args[1]).name; - RegionID region_id = static_cast(safeGet(typeid_cast(*args[2]).value)); + auto region_id = static_cast(safeGet(typeid_cast(*args[2]).value)); - const String schema_str = safeGet(typeid_cast(*args[3]).value); - String handle_pk_name = safeGet(typeid_cast(*args[4]).value); + const auto schema_str = safeGet(typeid_cast(*args[3]).value); + auto handle_pk_name = safeGet(typeid_cast(*args[4]).value); std::vector handles; for (size_t i = 5; i < args.size(); ++i) @@ -770,7 +784,7 @@ void MockRaftCommand::dbgFuncRegionSnapshotApplyDTFiles(Context & context, const if (args.size() != 1) throw Exception("Args not matched, should be: region-id", ErrorCodes::BAD_ARGUMENTS); - RegionID region_id = static_cast(safeGet(typeid_cast(*args.front()).value)); + auto region_id = static_cast(safeGet(typeid_cast(*args.front()).value)); const auto region_name = "__snap_snap_" + std::to_string(region_id); auto [new_region, ingest_ids] = GLOBAL_REGION_MAP.popRegionSnap(region_name); auto & tmt = context.getTMTContext(); diff --git a/dbms/src/Debug/dbgFuncRegion.cpp b/dbms/src/Debug/dbgFuncRegion.cpp index b2024eac1d8..f65a18b8fd0 100644 --- a/dbms/src/Debug/dbgFuncRegion.cpp +++ b/dbms/src/Debug/dbgFuncRegion.cpp @@ -61,9 +61,15 @@ void dbgFuncPutRegion(Context & context, const ASTs & args, DBGInvoker::Printer { std::vector start_keys; std::vector end_keys; + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } for (size_t i = 0; i < handle_column_size; i++) { - const auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset]; + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + const auto & column_info = table_info.columns[idx]; auto start_field = RegionBench::convertField(column_info, typeid_cast(*args[1 + i]).value); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); start_keys.emplace_back(start_datum.field()); diff --git a/dbms/src/Debug/dbgFuncSchema.cpp b/dbms/src/Debug/dbgFuncSchema.cpp index c388015dc10..9ef07f16e8b 100644 --- a/dbms/src/Debug/dbgFuncSchema.cpp +++ b/dbms/src/Debug/dbgFuncSchema.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -137,4 +138,5 @@ void dbgFuncIsTombstone(Context & context, const ASTs & args, DBGInvoker::Printe output(fmt_buf.toString()); } + } // namespace DB \ No newline at end of file diff --git a/dbms/src/Debug/dbgFuncSchema.h b/dbms/src/Debug/dbgFuncSchema.h index 162bc0af46b..51ab3ad41cf 100644 --- a/dbms/src/Debug/dbgFuncSchema.h +++ b/dbms/src/Debug/dbgFuncSchema.h @@ -46,5 +46,4 @@ void dbgFuncResetSchemas(Context & context, 
const ASTs & args, DBGInvoker::Printer output); // Usage: // ./storage-client.sh "DBGInvoke is_tombstone(db_name, table_name)" void dbgFuncIsTombstone(Context & context, const ASTs & args, DBGInvoker::Printer output); - } // namespace DB diff --git a/dbms/src/Debug/dbgFuncSchemaName.cpp b/dbms/src/Debug/dbgFuncSchemaName.cpp index 4c2ad86bd62..3aa7b6e3af4 100644 --- a/dbms/src/Debug/dbgFuncSchemaName.cpp +++ b/dbms/src/Debug/dbgFuncSchemaName.cpp @@ -128,4 +128,109 @@ BlockInputStreamPtr dbgFuncQueryMapped(Context & context, const ASTs & args) return executeQuery(query, context, true).in; } + +void dbgFuncGetTiflashReplicaCount(Context & context, const ASTs & args, DBGInvoker::Printer output) +{ + if (args.size() != 2) + throw Exception("Args not matched, should be: database-name, table-name", ErrorCodes::BAD_ARGUMENTS); + + const String & database_name = typeid_cast(*args[0]).name; + FmtBuffer fmt_buf; + + const String & table_name = typeid_cast(*args[1]).name; + auto mapped = mappedTable(context, database_name, table_name); + auto storage = context.getTable(mapped->first, mapped->second); + auto managed_storage = std::dynamic_pointer_cast(storage); + if (!managed_storage) + throw Exception(database_name + "." + table_name + " is not ManageableStorage", ErrorCodes::BAD_ARGUMENTS); + + fmt_buf.append((std::to_string(managed_storage->getTableInfo().replica_info.count))); + + output(fmt_buf.toString()); +} + +void dbgFuncGetPartitionTablesTiflashReplicaCount(Context & context, const ASTs & args, DBGInvoker::Printer output) +{ + if (args.size() != 2) + throw Exception("Args not matched, should be: database-name, table-name", ErrorCodes::BAD_ARGUMENTS); + + const String & database_name = typeid_cast(*args[0]).name; + FmtBuffer fmt_buf; + + const String & table_name = typeid_cast(*args[1]).name; + auto mapped = mappedTable(context, database_name, table_name); + auto storage = context.getTable(mapped->first, mapped->second); + auto managed_storage = std::dynamic_pointer_cast(storage); + if (!managed_storage) + throw Exception(database_name + "." + table_name + " is not ManageableStorage", ErrorCodes::BAD_ARGUMENTS); + + auto table_info = managed_storage->getTableInfo(); + + if (!table_info.isLogicalPartitionTable()) + throw Exception(database_name + "." + table_name + " is not a logical partition table", ErrorCodes::BAD_ARGUMENTS); + + SchemaNameMapper name_mapper; + for (const auto & part_def : table_info.partition.definitions) + { + auto partition_table_info = table_info.producePartitionTableInfo(part_def.id, name_mapper); + auto partition_storage = context.getTMTContext().getStorages().get(partition_table_info->id); + fmt_buf.append((std::to_string(partition_storage->getTableInfo().replica_info.count))); + fmt_buf.append("/"); + } + + output(fmt_buf.toString()); +} + +void dbgFuncGetTiflashMode(Context & context, const ASTs & args, DBGInvoker::Printer output) +{ + if (args.size() != 2) + throw Exception("Args not matched, should be: database-name, table-name", ErrorCodes::BAD_ARGUMENTS); + + const String & database_name = typeid_cast(*args[0]).name; + FmtBuffer fmt_buf; + + const String & table_name = typeid_cast(*args[1]).name; + auto mapped = mappedTable(context, database_name, table_name); + auto storage = context.getTable(mapped->first, mapped->second); + auto managed_storage = std::dynamic_pointer_cast(storage); + if (!managed_storage) + throw Exception(database_name + "." + table_name + " is not ManageableStorage", ErrorCodes::BAD_ARGUMENTS); + + fmt_buf.append((TiFlashModeToString(managed_storage->getTableInfo().tiflash_mode))); + + output(fmt_buf.toString()); +} + +void dbgFuncGetPartitionTablesTiflashMode(Context & context, const ASTs & args, DBGInvoker::Printer output) +{ + if (args.size() != 2) + throw Exception("Args not matched, should be: database-name, table-name", ErrorCodes::BAD_ARGUMENTS); + + const String & database_name = typeid_cast(*args[0]).name; + FmtBuffer fmt_buf; + + const String & table_name = typeid_cast(*args[1]).name; + auto mapped = mappedTable(context, database_name, table_name); + auto storage = context.getTable(mapped->first, mapped->second); + auto managed_storage = std::dynamic_pointer_cast(storage); + if (!managed_storage) + throw Exception(database_name + "." + table_name + " is not ManageableStorage", ErrorCodes::BAD_ARGUMENTS); + + auto table_info = managed_storage->getTableInfo(); + + if (!table_info.isLogicalPartitionTable()) + throw Exception(database_name + "." + table_name + " is not a logical partition table", ErrorCodes::BAD_ARGUMENTS); + + SchemaNameMapper name_mapper; + for (const auto & part_def : table_info.partition.definitions) + { + auto partition_table_info = table_info.producePartitionTableInfo(part_def.id, name_mapper); + auto partition_storage = context.getTMTContext().getStorages().get(partition_table_info->id); + fmt_buf.append((TiFlashModeToString(partition_storage->getTableInfo().tiflash_mode))); + fmt_buf.append("/"); + } + + output(fmt_buf.toString()); +} + } // namespace DB diff --git a/dbms/src/Debug/dbgFuncSchemaName.h b/dbms/src/Debug/dbgFuncSchemaName.h index 8e95aaab908..ec18f89e911 100644 --- a/dbms/src/Debug/dbgFuncSchemaName.h +++ b/dbms/src/Debug/dbgFuncSchemaName.h @@ -40,4 +40,24 @@ void dbgFuncMappedTable(Context & context, const ASTs & args, DBGInvoker::Printe // ./storage-client.sh "DBGInvoke query_mapped('select * from $d.$t', database_name[, table_name])" BlockInputStreamPtr dbgFuncQueryMapped(Context & context, const ASTs & args); +// Get table's tiflash replica count with mapped table name +// Usage: +// ./storage-client.sh "DBGInvoke get_tiflash_replica_count(db_name, table_name)" +void dbgFuncGetTiflashReplicaCount(Context & context, const ASTs & args, DBGInvoker::Printer output); + +// Get the logical table's partition tables' tiflash replica counts with mapped table name +// Usage: +// ./storage-client.sh "DBGInvoke get_partition_tables_tiflash_replica_count(db_name, table_name)" +void dbgFuncGetPartitionTablesTiflashReplicaCount(Context & context, const ASTs & args, DBGInvoker::Printer output); + +// Get table's tiflash mode with mapped table name +// Usage: +// ./storage-client.sh "DBGInvoke get_tiflash_mode(db_name, table_name)" +void dbgFuncGetTiflashMode(Context & context, const ASTs & args, DBGInvoker::Printer output); + +// Get the logical table's partition tables' tiflash modes with mapped table name +// Usage: +// ./storage-client.sh "DBGInvoke get_partition_tables_tiflash_mode(db_name, table_name)" +void dbgFuncGetPartitionTablesTiflashMode(Context & context, const ASTs & args, DBGInvoker::Printer output); + } // namespace DB diff --git a/dbms/src/Debug/dbgTools.cpp b/dbms/src/Debug/dbgTools.cpp index 685b2563a3b..854d8a18bd5 100644 --- a/dbms/src/Debug/dbgTools.cpp +++ b/dbms/src/Debug/dbgTools.cpp @@ -310,7 +310,7 @@ void insert( // // Parse the fields in the inserted row std::vector fields; { - for (ASTs::const_iterator it =
values_begin; it != values_end; ++it) + for (auto it = values_begin; it != values_end; ++it) { auto field = typeid_cast((*it).get())->value; fields.emplace_back(field); @@ -330,11 +330,18 @@ void insert( // if (table_info.is_common_handle) { std::vector keys; + + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } + for (size_t i = 0; i < table_info.getPrimaryIndexInfo().idx_cols.size(); i++) { - const auto & idx_col = table_info.getPrimaryIndexInfo().idx_cols[i]; - const auto & column_info = table_info.columns[idx_col.offset]; - auto start_field = RegionBench::convertField(column_info, fields[idx_col.offset]); + const auto & col_idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + const auto & column_info = table_info.columns[col_idx]; + auto start_field = RegionBench::convertField(column_info, fields[col_idx]); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); keys.emplace_back(start_datum.field()); } diff --git a/dbms/src/Dictionaries/CacheDictionary.cpp b/dbms/src/Dictionaries/CacheDictionary.cpp index 8573bdad6bd..0d7243ede8f 100644 --- a/dbms/src/Dictionaries/CacheDictionary.cpp +++ b/dbms/src/Dictionaries/CacheDictionary.cpp @@ -12,57 +12,36 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include -#include -#include #include +#include +#include #include -#include +#include #include -#include -#include #include -#include +#include +#include +#include #include #include #include -#include -#include -#include - - -namespace ProfileEvents -{ - extern const Event DictCacheKeysRequested; - extern const Event DictCacheKeysRequestedMiss; - extern const Event DictCacheKeysRequestedFound; - extern const Event DictCacheKeysExpired; - extern const Event DictCacheKeysNotFound; - extern const Event DictCacheKeysHit; - extern const Event DictCacheRequestTimeNs; - extern const Event DictCacheRequests; - extern const Event DictCacheLockWriteNs; - extern const Event DictCacheLockReadNs; -} - -namespace CurrentMetrics -{ - extern const Metric DictCacheRequests; -} +#include +#include +#include +#include +#include +#include namespace DB { - namespace ErrorCodes { - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; - extern const int UNSUPPORTED_METHOD; - extern const int LOGICAL_ERROR; -} +extern const int TYPE_MISMATCH; +extern const int BAD_ARGUMENTS; +extern const int UNSUPPORTED_METHOD; +extern const int LOGICAL_ERROR; +} // namespace ErrorCodes inline size_t CacheDictionary::getCellIdx(const Key id) const @@ -73,15 +52,15 @@ inline size_t CacheDictionary::getCellIdx(const Key id) const } -CacheDictionary::CacheDictionary(const std::string & name, const DictionaryStructure & dict_struct, - DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, - const size_t size) - : name{name}, dict_struct(dict_struct), - source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime), - size{roundUpToPowerOfTwoOrZero(std::max(size, size_t(max_collision_length)))}, - size_overlap_mask{this->size - 1}, - cells{this->size}, - rnd_engine(randomSeed()) +CacheDictionary::CacheDictionary(const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, const size_t size) + : name{name} + , dict_struct(dict_struct) + , 
source_ptr{std::move(source_ptr)} + , dict_lifetime(dict_lifetime) + , size{roundUpToPowerOfTwoOrZero(std::max(size, size_t(max_collision_length)))} + , size_overlap_mask{this->size - 1} + , cells{this->size} + , rnd_engine(randomSeed()) { if (!this->source_ptr->supportsSelectiveLoad()) throw Exception{ @@ -100,13 +79,19 @@ void CacheDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray(hierarchical_attribute->null_values); - getItemsNumber(*hierarchical_attribute, ids, out, [&] (const size_t) { return null_value; }); + getItemsNumber(*hierarchical_attribute, ids, out, [&](const size_t) { return null_value; }); } /// Allow to use single value in same way as array. -static inline CacheDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) { return arr[idx]; } -static inline CacheDictionary::Key getAt(const CacheDictionary::Key & value, const size_t) { return value; } +static inline CacheDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) +{ + return arr[idx]; +} +static inline CacheDictionary::Key getAt(const CacheDictionary::Key & value, const size_t) +{ + return value; +} template @@ -118,7 +103,7 @@ void CacheDictionary::isInImpl( /// Transform all children to parents until ancestor id or null_value will be reached. size_t size = out.size(); - memset(out.data(), 0xFF, size); /// 0xFF means "not calculated" + memset(out.data(), 0xFF, size); /// 0xFF means "not calculated" const auto null_value = std::get(hierarchical_attribute->null_values); @@ -224,19 +209,19 @@ void CacheDictionary::isInConstantVector( } -#define DECLARE(TYPE)\ -void CacheDictionary::get##TYPE(const std::string & attribute_name, const PaddedPODArray & ids, PaddedPODArray & out) const\ -{\ - auto & attribute = getAttribute(attribute_name);\ - if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\ - throw Exception{\ - name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\ - ErrorCodes::TYPE_MISMATCH};\ - \ - const auto null_value = std::get(attribute.null_values);\ - \ - getItemsNumber(attribute, ids, out, [&] (const size_t) { return null_value; });\ -} +#define DECLARE(TYPE) \ + void CacheDictionary::get##TYPE(const std::string & attribute_name, const PaddedPODArray & ids, PaddedPODArray & out) const \ + { \ + auto & attribute = getAttribute(attribute_name); \ + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE)) \ + throw Exception{ \ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), \ + ErrorCodes::TYPE_MISMATCH}; \ + \ + const auto null_value = std::get(attribute.null_values); \ + \ + getItemsNumber(attribute, ids, out, [&](const size_t) { return null_value; }); \ + } DECLARE(UInt8) DECLARE(UInt16) DECLARE(UInt32) @@ -260,22 +245,24 @@ void CacheDictionary::getString(const std::string & attribute_name, const Padded const auto null_value = StringRef{std::get(attribute.null_values)}; - getItemsString(attribute, ids, out, [&] (const size_t) { return null_value; }); + getItemsString(attribute, ids, out, [&](const size_t) { return null_value; }); } -#define DECLARE(TYPE)\ -void CacheDictionary::get##TYPE(\ - const std::string & attribute_name, const PaddedPODArray & ids, const PaddedPODArray & def,\ - PaddedPODArray & out) const\ -{\ - auto & attribute = getAttribute(attribute_name);\ - if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\ - throw Exception{\ - name + ": type mismatch: attribute " + 
attribute_name + " has type " + toString(attribute.type),\ - ErrorCodes::TYPE_MISMATCH};\ - \ - getItemsNumber(attribute, ids, out, [&] (const size_t row) { return def[row]; });\ -} +#define DECLARE(TYPE) \ + void CacheDictionary::get##TYPE( \ + const std::string & attribute_name, \ + const PaddedPODArray & ids, \ + const PaddedPODArray & def, \ + PaddedPODArray & out) const \ + { \ + auto & attribute = getAttribute(attribute_name); \ + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE)) \ + throw Exception{ \ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), \ + ErrorCodes::TYPE_MISMATCH}; \ + \ + getItemsNumber(attribute, ids, out, [&](const size_t row) { return def[row]; }); \ + } DECLARE(UInt8) DECLARE(UInt16) DECLARE(UInt32) @@ -290,7 +277,9 @@ DECLARE(Float64) #undef DECLARE void CacheDictionary::getString( - const std::string & attribute_name, const PaddedPODArray & ids, const ColumnString * const def, + const std::string & attribute_name, + const PaddedPODArray & ids, + const ColumnString * const def, ColumnString * const out) const { auto & attribute = getAttribute(attribute_name); @@ -299,21 +288,24 @@ void CacheDictionary::getString( name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), ErrorCodes::TYPE_MISMATCH}; - getItemsString(attribute, ids, out, [&] (const size_t row) { return def->getDataAt(row); }); + getItemsString(attribute, ids, out, [&](const size_t row) { return def->getDataAt(row); }); } -#define DECLARE(TYPE)\ -void CacheDictionary::get##TYPE(\ - const std::string & attribute_name, const PaddedPODArray & ids, const TYPE def, PaddedPODArray & out) const\ -{\ - auto & attribute = getAttribute(attribute_name);\ - if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\ - throw Exception{\ - name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\ - ErrorCodes::TYPE_MISMATCH};\ - \ - getItemsNumber(attribute, ids, out, [&] (const size_t) { return def; });\ -} +#define DECLARE(TYPE) \ + void CacheDictionary::get##TYPE( \ + const std::string & attribute_name, \ + const PaddedPODArray & ids, \ + const TYPE def, \ + PaddedPODArray & out) const \ + { \ + auto & attribute = getAttribute(attribute_name); \ + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE)) \ + throw Exception{ \ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), \ + ErrorCodes::TYPE_MISMATCH}; \ + \ + getItemsNumber(attribute, ids, out, [&](const size_t) { return def; }); \ + } DECLARE(UInt8) DECLARE(UInt16) DECLARE(UInt32) @@ -328,7 +320,9 @@ DECLARE(Float64) #undef DECLARE void CacheDictionary::getString( - const std::string & attribute_name, const PaddedPODArray & ids, const String & def, + const std::string & attribute_name, + const PaddedPODArray & ids, + const String & def, ColumnString * const out) const { auto & attribute = getAttribute(attribute_name); @@ -337,7 +331,7 @@ void CacheDictionary::getString( name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), ErrorCodes::TYPE_MISMATCH}; - getItemsString(attribute, ids, out, [&] (const size_t) { return StringRef{def}; }); + getItemsString(attribute, ids, out, [&](const size_t) { return StringRef{def}; }); } @@ -390,8 +384,6 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray const auto rows = ext::size(ids); { - const 
ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, decide which ones require update for (const auto row : ext::range(0, rows)) @@ -416,10 +408,6 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray } } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - query_count.fetch_add(rows, std::memory_order_relaxed); hit_count.fetch_add(rows - outdated_ids.size(), std::memory_order_release); @@ -427,21 +415,19 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray return; std::vector required_ids(outdated_ids.size()); - std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids), - [] (auto & pair) { return pair.first; }); + std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids), [](auto & pair) { return pair.first; }); /// request new values - update(required_ids, - [&] (const auto id, const auto) - { - for (const auto row : outdated_ids[id]) - out[row] = true; - }, - [&] (const auto id, const auto) - { - for (const auto row : outdated_ids[id]) - out[row] = false; - }); + update( + required_ids, + [&](const auto id, const auto) { + for (const auto row : outdated_ids[id]) + out[row] = true; + }, + [&](const auto id, const auto) { + for (const auto row : outdated_ids[id]) + out[row] = false; + }); } @@ -476,68 +462,68 @@ CacheDictionary::Attribute CacheDictionary::createAttributeWithType(const Attrib switch (type) { - case AttributeUnderlyingType::UInt8: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(UInt8); - break; - case AttributeUnderlyingType::UInt16: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(UInt16); - break; - case AttributeUnderlyingType::UInt32: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(UInt32); - break; - case AttributeUnderlyingType::UInt64: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(UInt64); - break; - case AttributeUnderlyingType::UInt128: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(UInt128); - break; - case AttributeUnderlyingType::Int8: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(Int8); - break; - case AttributeUnderlyingType::Int16: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(Int16); - break; - case AttributeUnderlyingType::Int32: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(Int32); - break; - case AttributeUnderlyingType::Int64: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(Int64); - break; - case AttributeUnderlyingType::Float32: - std::get(attr.null_values) = null_value.get(); - 
std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(Float32); - break; - case AttributeUnderlyingType::Float64: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(Float64); - break; - case AttributeUnderlyingType::String: - std::get(attr.null_values) = null_value.get(); - std::get>(attr.arrays) = std::make_unique>(size); - bytes_allocated += size * sizeof(StringRef); - if (!string_arena) - string_arena = std::make_unique(); - break; + case AttributeUnderlyingType::UInt8: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(UInt8); + break; + case AttributeUnderlyingType::UInt16: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(UInt16); + break; + case AttributeUnderlyingType::UInt32: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(UInt32); + break; + case AttributeUnderlyingType::UInt64: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(UInt64); + break; + case AttributeUnderlyingType::UInt128: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(UInt128); + break; + case AttributeUnderlyingType::Int8: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(Int8); + break; + case AttributeUnderlyingType::Int16: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(Int16); + break; + case AttributeUnderlyingType::Int32: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(Int32); + break; + case AttributeUnderlyingType::Int64: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(Int64); + break; + case AttributeUnderlyingType::Float32: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(Float32); + break; + case AttributeUnderlyingType::Float64: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(Float64); + break; + case AttributeUnderlyingType::String: + std::get(attr.null_values) = null_value.get(); + std::get>(attr.arrays) = std::make_unique>(size); + bytes_allocated += size * sizeof(StringRef); + if (!string_arena) + string_arena = std::make_unique(); + break; } return attr; @@ -551,8 +537,8 @@ void CacheDictionary::getItemsNumber( PaddedPODArray & out, DefaultGetter && get_default) const { - if (false) {} -#define DISPATCH(TYPE) \ + if (false) {} // NOLINT +#define DISPATCH(TYPE) \ else if (attribute.type == AttributeUnderlyingType::TYPE) \ getItemsNumberImpl(attribute, ids, out, std::forward(get_default)); DISPATCH(UInt8) @@ -567,8 +553,7 @@ void CacheDictionary::getItemsNumber( DISPATCH(Float32) DISPATCH(Float64) #undef DISPATCH - else - throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR); + else throw 
Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR); } template @@ -586,8 +571,6 @@ void CacheDictionary::getItemsNumberImpl( size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, decide which ones require update for (const auto row : ext::range(0, rows)) @@ -618,10 +601,6 @@ void CacheDictionary::getItemsNumberImpl( } } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - query_count.fetch_add(rows, std::memory_order_relaxed); hit_count.fetch_add(rows - outdated_ids.size(), std::memory_order_release); @@ -629,23 +608,21 @@ void CacheDictionary::getItemsNumberImpl( return; std::vector required_ids(outdated_ids.size()); - std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids), - [] (auto & pair) { return pair.first; }); + std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids), [](auto & pair) { return pair.first; }); /// request new values - update(required_ids, - [&] (const auto id, const auto cell_idx) - { - const auto attribute_value = attribute_array[cell_idx]; + update( + required_ids, + [&](const auto id, const auto cell_idx) { + const auto attribute_value = attribute_array[cell_idx]; - for (const auto row : outdated_ids[id]) - out[row] = static_cast(attribute_value); - }, - [&] (const auto id, const auto) - { - for (const auto row : outdated_ids[id]) - out[row] = get_default(row); - }); + for (const auto row : outdated_ids[id]) + out[row] = static_cast(attribute_value); // NOLINT + }, + [&](const auto id, const auto) { + for (const auto row : outdated_ids[id]) + out[row] = get_default(row); + }); } template @@ -666,8 +643,6 @@ void CacheDictionary::getItemsString( /// perform optimistic version, fallback to pessimistic if failed { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, discard on fail for (const auto row : ext::range(0, rows)) @@ -710,8 +685,6 @@ void CacheDictionary::getItemsString( size_t total_length = 0; size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - const auto now = std::chrono::system_clock::now(); for (const auto row : ext::range(0, ids.size())) { @@ -741,10 +714,6 @@ void CacheDictionary::getItemsString( } } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - query_count.fetch_add(rows, std::memory_order_relaxed); hit_count.fetch_add(rows - outdated_ids.size(), std::memory_order_release); @@ -752,22 +721,20 @@ void CacheDictionary::getItemsString( if (!outdated_ids.empty()) { std::vector required_ids(outdated_ids.size()); - std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids), - [] (auto & pair) { return pair.first; }); - - update(required_ids, - [&] (const auto id, const auto cell_idx) - { - const auto attribute_value = attribute_array[cell_idx]; 
- - map[id] = String{attribute_value}; - total_length += (attribute_value.size + 1) * outdated_ids[id].size(); - }, - [&] (const auto id, const auto) - { - for (const auto row : outdated_ids[id]) - total_length += get_default(row).size + 1; - }); + std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids), [](auto & pair) { return pair.first; }); + + update( + required_ids, + [&](const auto id, const auto cell_idx) { + const auto attribute_value = attribute_array[cell_idx]; + + map[id] = String{attribute_value}; + total_length += (attribute_value.size + 1) * outdated_ids[id].size(); + }, + [&](const auto id, const auto) { + for (const auto row : outdated_ids[id]) + total_length += get_default(row).size + 1; + }); } out->getChars().reserve(total_length); @@ -790,18 +757,13 @@ void CacheDictionary::update( { std::unordered_map remaining_ids{requested_ids.size()}; for (const auto id : requested_ids) - remaining_ids.insert({ id, 0 }); + remaining_ids.insert({id, 0}); - std::uniform_int_distribution distribution - { + std::uniform_int_distribution distribution{ dict_lifetime.min_sec, - dict_lifetime.max_sec - }; - - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + dict_lifetime.max_sec}; { - CurrentMetrics::Increment metric_increment{CurrentMetrics::DictCacheRequests}; Stopwatch watch; auto stream = source_ptr->loadIds(requested_ids); stream->readPrefix(); @@ -810,7 +772,7 @@ void CacheDictionary::update( while (const auto block = stream->read()) { - const auto id_column = typeid_cast(block.safeGetByPosition(0).column.get()); + const auto * id_column = typeid_cast(block.safeGetByPosition(0).column.get()); if (!id_column) throw Exception{ name + ": id column has type different from UInt64.", @@ -819,8 +781,7 @@ void CacheDictionary::update( const auto & ids = id_column->getData(); /// cache column pointers - const auto column_ptrs = ext::map(ext::range(0, attributes.size()), [&block] (size_t i) - { + const auto column_ptrs = ext::map(ext::range(0, attributes.size()), [&block](size_t i) { return block.safeGetByPosition(i + 1).column.get(); }); @@ -859,9 +820,6 @@ void CacheDictionary::update( } stream->readSuffix(); - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, requested_ids.size()); - ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed()); } size_t not_found_num = 0, found_num = 0; @@ -903,10 +861,6 @@ void CacheDictionary::update( /// inform caller that the cell has not been found on_id_not_found(id, cell_idx); } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num); - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num); - ProfileEvents::increment(ProfileEvents::DictCacheRequests); } @@ -914,32 +868,54 @@ void CacheDictionary::setDefaultAttributeValue(Attribute & attribute, const Key { switch (attribute.type) { - case AttributeUnderlyingType::UInt8: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::UInt16: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::UInt32: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::UInt64: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::UInt128: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::Int8: 
std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::Int16: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::Int32: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::Int64: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::Float32: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::Float64: std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); break; - case AttributeUnderlyingType::String: - { - const auto & null_value_ref = std::get(attribute.null_values); - auto & string_ref = std::get>(attribute.arrays)[idx]; - - if (string_ref.data != null_value_ref.data()) - { - if (string_ref.data) - string_arena->free(const_cast(string_ref.data), string_ref.size); + case AttributeUnderlyingType::UInt8: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::UInt16: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::UInt32: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::UInt64: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::UInt128: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::Int8: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::Int16: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::Int32: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::Int64: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::Float32: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::Float64: + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + break; + case AttributeUnderlyingType::String: + { + const auto & null_value_ref = std::get(attribute.null_values); + auto & string_ref = std::get>(attribute.arrays)[idx]; - string_ref = StringRef{null_value_ref}; - } + if (string_ref.data != null_value_ref.data()) + { + if (string_ref.data) + string_arena->free(const_cast(string_ref.data), string_ref.size); - break; + string_ref = StringRef{null_value_ref}; } + + break; + } } } @@ -947,39 +923,61 @@ void CacheDictionary::setAttributeValue(Attribute & attribute, const Key idx, co { switch (attribute.type) { - case AttributeUnderlyingType::UInt8: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::UInt16: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::UInt32: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::UInt64: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::UInt128: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::Int8: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::Int16: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::Int32: 
std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::Int64: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::Float32: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::Float64: std::get>(attribute.arrays)[idx] = value.get(); break; - case AttributeUnderlyingType::String: - { - const auto & string = value.get(); - auto & string_ref = std::get>(attribute.arrays)[idx]; - const auto & null_value_ref = std::get(attribute.null_values); - - /// free memory unless it points to a null_value - if (string_ref.data && string_ref.data != null_value_ref.data()) - string_arena->free(const_cast(string_ref.data), string_ref.size); + case AttributeUnderlyingType::UInt8: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::UInt16: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::UInt32: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::UInt64: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::UInt128: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::Int8: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::Int16: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::Int32: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::Int64: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::Float32: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::Float64: + std::get>(attribute.arrays)[idx] = value.get(); + break; + case AttributeUnderlyingType::String: + { + const auto & string = value.get(); + auto & string_ref = std::get>(attribute.arrays)[idx]; + const auto & null_value_ref = std::get(attribute.null_values); - const auto size = string.size(); - if (size != 0) - { - auto string_ptr = string_arena->alloc(size + 1); - std::copy(string.data(), string.data() + size + 1, string_ptr); - string_ref = StringRef{string_ptr, size}; - } - else - string_ref = {}; + /// free memory unless it points to a null_value + if (string_ref.data && string_ref.data != null_value_ref.data()) + string_arena->free(const_cast(string_ref.data), string_ref.size); - break; + const auto size = string.size(); + if (size != 0) + { + auto * string_ptr = string_arena->alloc(size + 1); + std::copy(string.data(), string.data() + size + 1, string_ptr); + string_ref = StringRef{string_ptr, size}; } + else + string_ref = {}; + + break; + } } } @@ -989,22 +987,18 @@ CacheDictionary::Attribute & CacheDictionary::getAttribute(const std::string & a if (it == std::end(attribute_index_by_name)) throw Exception{ name + ": no such attribute '" + attribute_name + "'", - ErrorCodes::BAD_ARGUMENTS - }; + ErrorCodes::BAD_ARGUMENTS}; return attributes[it->second]; } bool CacheDictionary::isEmptyCell(const UInt64 idx) const { - return (idx != zero_cell_idx && cells[idx].id == 0) || (cells[idx].data - == ext::safe_bit_cast(CellMetadata::time_point_t())); + return (idx != zero_cell_idx && cells[idx].id == 0) || (cells[idx].data == ext::safe_bit_cast(CellMetadata::time_point_t())); } PaddedPODArray CacheDictionary::getCachedIds() const { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - PaddedPODArray array; for (size_t idx = 0; 
idx < cells.size(); ++idx) { @@ -1024,4 +1018,4 @@ BlockInputStreamPtr CacheDictionary::getBlockInputStream(const Names & column_na } -} +} // namespace DB diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp index 330ee036136..fb9a94b29a0 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp @@ -12,48 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include #include #include -#include -#include -#include -#include #include -#include -#include - - -namespace ProfileEvents -{ - - extern const Event DictCacheKeysRequested; - extern const Event DictCacheKeysRequestedMiss; - extern const Event DictCacheKeysRequestedFound; - extern const Event DictCacheKeysExpired; - extern const Event DictCacheKeysNotFound; - extern const Event DictCacheKeysHit; - extern const Event DictCacheRequestTimeNs; - extern const Event DictCacheLockWriteNs; - extern const Event DictCacheLockReadNs; -} - -namespace CurrentMetrics -{ - extern const Metric DictCacheRequests; -} +#include +#include +#include +#include +#include +#include +#include +#include namespace DB { - namespace ErrorCodes { - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; - extern const int UNSUPPORTED_METHOD; -} +extern const int TYPE_MISMATCH; +extern const int BAD_ARGUMENTS; +extern const int UNSUPPORTED_METHOD; +} // namespace ErrorCodes inline UInt64 ComplexKeyCacheDictionary::getCellIdx(const StringRef key) const @@ -64,13 +43,14 @@ inline UInt64 ComplexKeyCacheDictionary::getCellIdx(const StringRef key) const } -ComplexKeyCacheDictionary::ComplexKeyCacheDictionary(const std::string & name, const DictionaryStructure & dict_struct, - DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, - const size_t size) - : name{name}, dict_struct(dict_struct), source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime), - size{roundUpToPowerOfTwoOrZero(std::max(size, size_t(max_collision_length)))}, - size_overlap_mask{this->size - 1}, - rnd_engine(randomSeed()) +ComplexKeyCacheDictionary::ComplexKeyCacheDictionary(const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, const size_t size) + : name{name} + , dict_struct(dict_struct) + , source_ptr{std::move(source_ptr)} + , dict_lifetime(dict_lifetime) + , size{roundUpToPowerOfTwoOrZero(std::max(size, size_t(max_collision_length)))} + , size_overlap_mask{this->size - 1} + , rnd_engine(randomSeed()) { if (!this->source_ptr->supportsSelectiveLoad()) throw Exception{ @@ -85,7 +65,9 @@ ComplexKeyCacheDictionary::ComplexKeyCacheDictionary(const ComplexKeyCacheDictio {} void ComplexKeyCacheDictionary::getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, + const std::string & attribute_name, + const Columns & key_columns, + const DataTypes & key_types, ColumnString * out) const { dict_struct.validateKeyTypes(key_types); @@ -98,12 +80,15 @@ void ComplexKeyCacheDictionary::getString( const auto null_value = StringRef{std::get(attribute.null_values)}; - getItemsString(attribute, key_columns, out, [&] (const size_t) { return null_value; }); + getItemsString(attribute, key_columns, out, [&](const size_t) { return null_value; }); } void ComplexKeyCacheDictionary::getString( - const std::string & attribute_name, const Columns & 
key_columns, const DataTypes & key_types, - const ColumnString * const def, ColumnString * const out) const + const std::string & attribute_name, + const Columns & key_columns, + const DataTypes & key_types, + const ColumnString * const def, + ColumnString * const out) const { dict_struct.validateKeyTypes(key_types); @@ -113,12 +98,15 @@ void ComplexKeyCacheDictionary::getString( name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), ErrorCodes::TYPE_MISMATCH}; - getItemsString(attribute, key_columns, out, [&] (const size_t row) { return def->getDataAt(row); }); + getItemsString(attribute, key_columns, out, [&](const size_t row) { return def->getDataAt(row); }); } void ComplexKeyCacheDictionary::getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, - const String & def, ColumnString * const out) const + const std::string & attribute_name, + const Columns & key_columns, + const DataTypes & key_types, + const String & def, + ColumnString * const out) const { dict_struct.validateKeyTypes(key_types); @@ -128,7 +116,7 @@ void ComplexKeyCacheDictionary::getString( name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), ErrorCodes::TYPE_MISMATCH}; - getItemsString(attribute, key_columns, out, [&] (const size_t) { return StringRef{def}; }); + getItemsString(attribute, key_columns, out, [&](const size_t) { return StringRef{def}; }); } /// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag, @@ -190,8 +178,6 @@ void ComplexKeyCacheDictionary::has(const Columns & key_columns, const DataTypes size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, decide which ones require update for (const auto row : ext::range(0, rows_num)) @@ -220,9 +206,6 @@ void ComplexKeyCacheDictionary::has(const Columns & key_columns, const DataTypes } } } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); query_count.fetch_add(rows_num, std::memory_order_relaxed); hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); @@ -231,18 +214,18 @@ void ComplexKeyCacheDictionary::has(const Columns & key_columns, const DataTypes return; std::vector required_rows(outdated_keys.size()); - std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), - [] (auto & pair) { return pair.getMapped().front(); }); + std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.getMapped().front(); }); /// request new values - update(key_columns, keys_array, required_rows, - [&] (const StringRef key, const auto) - { + update( + key_columns, + keys_array, + required_rows, + [&](const StringRef key, const auto) { for (const auto out_idx : outdated_keys[key]) out[out_idx] = true; }, - [&] (const StringRef key, const auto) - { + [&](const StringRef key, const auto) { for (const auto out_idx : outdated_keys[key]) out[out_idx] = false; }); @@ -297,8 +280,11 @@ void ComplexKeyCacheDictionary::freeKey(const StringRef key) const template StringRef ComplexKeyCacheDictionary::placeKeysInPool( - const size_t 
row, const Columns & key_columns, StringRefs & keys, - const std::vector & key_attributes, Pool & pool) + const size_t row, + const Columns & key_columns, + StringRefs & keys, + const std::vector & key_attributes, + Pool & pool) { const auto keys_size = key_columns.size(); size_t sum_keys_size{}; @@ -337,25 +323,32 @@ StringRef ComplexKeyCacheDictionary::placeKeysInPool( } } - return { place, sum_keys_size }; + return {place, sum_keys_size}; } /// Explicit instantiations. template StringRef ComplexKeyCacheDictionary::placeKeysInPool( - const size_t row, const Columns & key_columns, StringRefs & keys, - const std::vector & key_attributes, Arena & pool); + const size_t row, + const Columns & key_columns, + StringRefs & keys, + const std::vector & key_attributes, + Arena & pool); template StringRef ComplexKeyCacheDictionary::placeKeysInPool( - const size_t row, const Columns & key_columns, StringRefs & keys, - const std::vector & key_attributes, ArenaWithFreeLists & pool); + const size_t row, + const Columns & key_columns, + StringRefs & keys, + const std::vector & key_attributes, + ArenaWithFreeLists & pool); StringRef ComplexKeyCacheDictionary::placeKeysInFixedSizePool( - const size_t row, const Columns & key_columns) const + const size_t row, + const Columns & key_columns) const { - const auto res = fixed_size_keys_pool->alloc(); - auto place = res; + auto * const res = fixed_size_keys_pool->alloc(); + auto * place = res; for (const auto & key_column : key_columns) { @@ -364,36 +357,33 @@ StringRef ComplexKeyCacheDictionary::placeKeysInFixedSizePool( place += key.size; } - return { res, key_size }; + return {res, key_size}; } StringRef ComplexKeyCacheDictionary::copyIntoArena(StringRef src, Arena & arena) { char * allocated = arena.alloc(src.size); memcpy(allocated, src.data, src.size); - return { allocated, src.size }; + return {allocated, src.size}; } StringRef ComplexKeyCacheDictionary::copyKey(const StringRef key) const { - const auto res = key_size_is_fixed ? fixed_size_keys_pool->alloc() : keys_pool->alloc(key.size); + auto * const res = key_size_is_fixed ? 
fixed_size_keys_pool->alloc() : keys_pool->alloc(key.size); memcpy(res, key.data, key.size); - return { res, key.size }; + return {res, key.size}; } bool ComplexKeyCacheDictionary::isEmptyCell(const UInt64 idx) const { - return (cells[idx].key == StringRef{} && (idx != zero_cell_idx - || cells[idx].data == ext::safe_bit_cast(CellMetadata::time_point_t()))); + return (cells[idx].key == StringRef{} && (idx != zero_cell_idx || cells[idx].data == ext::safe_bit_cast(CellMetadata::time_point_t()))); } BlockInputStreamPtr ComplexKeyCacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { std::vector keys; { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - for (auto idx : ext::range(0, cells.size())) if (!isEmptyCell(idx) && !cells[idx].isDefault()) @@ -404,4 +394,4 @@ BlockInputStreamPtr ComplexKeyCacheDictionary::getBlockInputStream(const Names & return std::make_shared(shared_from_this(), max_block_size, keys, column_names); } -} +} // namespace DB diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h index feb61261f1d..19fe5214fef 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h @@ -14,12 +14,6 @@ #pragma once -#include -#include -#include -#include -#include -#include #include #include #include @@ -29,24 +23,17 @@ #include #include #include + +#include +#include #include #include #include +#include #include - - -namespace ProfileEvents -{ -extern const Event DictCacheKeysRequested; -extern const Event DictCacheKeysRequestedMiss; -extern const Event DictCacheKeysRequestedFound; -extern const Event DictCacheKeysExpired; -extern const Event DictCacheKeysNotFound; -extern const Event DictCacheKeysHit; -extern const Event DictCacheRequestTimeNs; -extern const Event DictCacheLockWriteNs; -extern const Event DictCacheLockReadNs; -} +#include +#include +#include namespace DB { @@ -54,10 +41,10 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase { public: ComplexKeyCacheDictionary(const std::string & name, - const DictionaryStructure & dict_struct, - DictionarySourcePtr source_ptr, - const DictionaryLifetime dict_lifetime, - const size_t size); + const DictionaryStructure & dict_struct, + DictionarySourcePtr source_ptr, + const DictionaryLifetime dict_lifetime, + const size_t size); ComplexKeyCacheDictionary(const ComplexKeyCacheDictionary & other); @@ -144,9 +131,12 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase /// In all functions below, key_columns must be full (non-constant) columns. /// See the requirement in IDataType.h for text-serialization functions. 
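The `DECLARE(TYPE)` blocks being re-aligned here are an X-macro: one invocation per supported numeric type expands into a typed getter, which keeps the dozen overloads in lockstep. A self-contained sketch of the idiom follows, with an illustrative class and type aliases rather than the real dictionary interface.

```cpp
#include <cstdint>
#include <string>
#include <vector>

// ClickHouse-style aliases, reproduced here so the sketch stands alone.
using UInt8 = std::uint8_t;
using Int64 = std::int64_t;
using Float64 = double;

struct ToyDictionary
{
    // Each DECLARE(TYPE) expands to one typed getter declaration, e.g.
    // DECLARE(Int64) -> void getInt64(const std::string &, std::vector<Int64> &) const;
#define DECLARE(TYPE) \
    void get##TYPE(const std::string & attribute_name, std::vector<TYPE> & out) const;
    DECLARE(UInt8)
    DECLARE(Int64)
    DECLARE(Float64)
#undef DECLARE
};
```

The trailing `#undef` is what allows the same macro name to be redefined for the defaulted-value variants further down the header.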
-#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, PaddedPODArray & out) const; +#define DECLARE(TYPE) \ + void get##TYPE( \ + const std::string & attribute_name, \ + const Columns & key_columns, \ + const DataTypes & key_types, \ + PaddedPODArray & out) const; DECLARE(UInt8) DECLARE(UInt16) DECLARE(UInt32) @@ -164,10 +154,10 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase #define DECLARE(TYPE) \ void get##TYPE(const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const PaddedPODArray & def, \ - PaddedPODArray & out) const; + const Columns & key_columns, \ + const DataTypes & key_types, \ + const PaddedPODArray & def, \ + PaddedPODArray & out) const; DECLARE(UInt8) DECLARE(UInt16) DECLARE(UInt32) @@ -182,17 +172,17 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase #undef DECLARE void getString(const std::string & attribute_name, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnString * const def, - ColumnString * const out) const; + const Columns & key_columns, + const DataTypes & key_types, + const ColumnString * const def, + ColumnString * const out) const; #define DECLARE(TYPE) \ void get##TYPE(const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const TYPE def, \ - PaddedPODArray & out) const; + const Columns & key_columns, \ + const DataTypes & key_types, \ + const TYPE def, \ + PaddedPODArray & out) const; DECLARE(UInt8) DECLARE(UInt16) DECLARE(UInt32) @@ -207,10 +197,10 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase #undef DECLARE void getString(const std::string & attribute_name, - const Columns & key_columns, - const DataTypes & key_types, - const String & def, - ColumnString * const out) const; + const Columns & key_columns, + const DataTypes & key_types, + const String & def, + ColumnString * const out) const; void has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray & out) const; @@ -263,17 +253,17 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase AttributeUnderlyingType type; std::tuple null_values; std::tuple, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType> + ContainerPtrType, + ContainerPtrType, + ContainerPtrType, + ContainerPtrType, + ContainerPtrType, + ContainerPtrType, + ContainerPtrType, + ContainerPtrType, + ContainerPtrType, + ContainerPtrType, + ContainerPtrType> arrays; }; @@ -283,7 +273,10 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase template void getItemsNumber( - Attribute & attribute, const Columns & key_columns, PaddedPODArray & out, DefaultGetter && get_default) const + Attribute & attribute, + const Columns & key_columns, + PaddedPODArray & out, + DefaultGetter && get_default) const { if (false) { @@ -308,7 +301,10 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase template void getItemsNumberImpl( - Attribute & attribute, const Columns & key_columns, PaddedPODArray & out, DefaultGetter && get_default) const + Attribute & attribute, + const Columns & key_columns, + PaddedPODArray & out, + DefaultGetter && get_default) const { /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } MapType> outdated_keys; @@ -322,8 
+318,6 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, decide which ones require update for (const auto row : ext::range(0, rows_num)) @@ -354,9 +348,6 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase } } } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); query_count.fetch_add(rows_num, std::memory_order_relaxed); hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); @@ -365,19 +356,21 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase std::vector required_rows(outdated_keys.size()); std::transform( - std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.getMapped().front(); }); + std::begin(outdated_keys), + std::end(outdated_keys), + std::begin(required_rows), + [](auto & pair) { return pair.getMapped().front(); }); /// request new values - update(key_columns, + update( + key_columns, keys_array, required_rows, - [&](const StringRef key, const size_t cell_idx) - { + [&](const StringRef key, const size_t cell_idx) { for (const auto row : outdated_keys[key]) out[row] = static_cast(attribute_array[cell_idx]); }, - [&](const StringRef key, const size_t) - { + [&](const StringRef key, const size_t) { for (const auto row : outdated_keys[key]) out[row] = get_default(row); }); @@ -400,8 +393,6 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase /// perform optimistic version, fallback to pessimistic if failed { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - const auto now = std::chrono::system_clock::now(); /// fetch up-to-date values, discard on fail for (const auto row : ext::range(0, rows_num)) @@ -446,8 +437,6 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase size_t total_length = 0; size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - const auto now = std::chrono::system_clock::now(); for (const auto row : ext::range(0, rows_num)) { @@ -477,9 +466,6 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase } } } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); query_count.fetch_add(rows_num, std::memory_order_relaxed); hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); @@ -488,16 +474,15 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase if (!outdated_keys.empty()) { std::vector required_rows(outdated_keys.size()); - std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) - { + std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.getMapped().front(); }); - update(key_columns, + update( + key_columns, keys_array, required_rows, - [&](const StringRef key, const size_t cell_idx) - { + [&](const StringRef key, const 
size_t cell_idx) { const StringRef attribute_value = attribute_array[cell_idx]; /// We must copy key and value to own memory, because it may be replaced with another @@ -508,8 +493,7 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase map[copied_key] = copied_value; total_length += (attribute_value.size + 1) * outdated_keys[key].size(); }, - [&](const StringRef key, const size_t) - { + [&](const StringRef key, const size_t) { for (const auto row : outdated_keys[key]) total_length += get_default(row).size + 1; }); @@ -521,17 +505,17 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase { const StringRef key = keys_array[row]; const auto it = map.find(key); - const auto string_ref = it != std::end(map) ? it->getMapped(): get_default(row); + const auto string_ref = it != std::end(map) ? it->getMapped() : get_default(row); out->insertData(string_ref.data, string_ref.size); } }; template void update(const Columns & in_key_columns, - const PODArray & in_keys, - const std::vector & in_requested_rows, - PresentKeyHandler && on_cell_updated, - AbsentKeyHandler && on_key_not_found) const + const PODArray & in_keys, + const std::vector & in_requested_rows, + PresentKeyHandler && on_cell_updated, + AbsentKeyHandler && on_key_not_found) const { MapType remaining_keys{in_requested_rows.size()}; for (const auto row : in_requested_rows) @@ -539,7 +523,6 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase std::uniform_int_distribution distribution(dict_lifetime.min_sec, dict_lifetime.max_sec); - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; { Stopwatch watch; auto stream = source_ptr->loadKeys(in_key_columns, in_requested_rows); @@ -555,10 +538,11 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase { /// cache column pointers const auto key_columns = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); + ext::range(0, keys_size), + [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); const auto attribute_columns = ext::map(ext::range(0, attributes_size), - [&](const size_t attribute_idx) { return block.safeGetByPosition(keys_size + attribute_idx).column; }); + [&](const size_t attribute_idx) { return block.safeGetByPosition(keys_size + attribute_idx).column; }); const auto rows_num = block.rows(); @@ -612,9 +596,6 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase } stream->readSuffix(); - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, in_requested_rows.size()); - ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed()); } size_t found_num = 0; @@ -671,9 +652,6 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase /// inform caller that the cell has not been found on_key_not_found(key, cell_idx); } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, found_num); - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num); }; UInt64 getCellIdx(const StringRef key) const; @@ -690,10 +668,10 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase template static StringRef placeKeysInPool(const size_t row, - const Columns & key_columns, - StringRefs & keys, - const std::vector & key_attributes, - Arena & pool); + const Columns & key_columns, + StringRefs & keys, + const std::vector & key_attributes, + Arena & pool); StringRef placeKeysInFixedSizePool(const size_t row, const 
Columns & key_columns) const; @@ -752,4 +730,4 @@ class ComplexKeyCacheDictionary final : public IDictionaryBase const std::chrono::time_point creation_time = std::chrono::system_clock::now(); }; -} +} // namespace DB diff --git a/dbms/src/Encryption/WriteBufferFromFileProvider.cpp b/dbms/src/Encryption/WriteBufferFromFileProvider.cpp index 4c99b8e24b1..a17dd85d379 100644 --- a/dbms/src/Encryption/WriteBufferFromFileProvider.cpp +++ b/dbms/src/Encryption/WriteBufferFromFileProvider.cpp @@ -19,7 +19,6 @@ namespace ProfileEvents { extern const Event WriteBufferFromFileDescriptorWrite; -extern const Event WriteBufferFromFileDescriptorWriteFailed; extern const Event WriteBufferFromFileDescriptorWriteBytes; } // namespace ProfileEvents @@ -72,8 +71,7 @@ void WriteBufferFromFileProvider::nextImpl() if ((-1 == res || 0 == res) && errno != EINTR) { - ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed); - throwFromErrno("Cannot write to file " + getFileName(), ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); + throwFromErrno("Cannot write to file " + getFileName(), ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); // NOLINT } if (res > 0) diff --git a/dbms/src/Encryption/createReadBufferFromFileBaseByFileProvider.cpp b/dbms/src/Encryption/createReadBufferFromFileBaseByFileProvider.cpp index b76d58c20cd..1858d474d60 100644 --- a/dbms/src/Encryption/createReadBufferFromFileBaseByFileProvider.cpp +++ b/dbms/src/Encryption/createReadBufferFromFileBaseByFileProvider.cpp @@ -20,10 +20,6 @@ #endif #include #include -namespace ProfileEvents -{ -extern const Event CreatedReadBufferOrdinary; -} namespace DB { @@ -46,7 +42,6 @@ std::unique_ptr createReadBufferFromFileBaseByFileProvid { if ((aio_threshold == 0) || (estimated_size < aio_threshold)) { - ProfileEvents::increment(ProfileEvents::CreatedReadBufferOrdinary); return std::make_unique( file_provider, filename_, @@ -75,7 +70,6 @@ createReadBufferFromFileBaseByFileProvider( size_t checksum_frame_size, int flags_) { - ProfileEvents::increment(ProfileEvents::CreatedReadBufferOrdinary); auto file = file_provider->newRandomAccessFile(filename_, encryption_path_, read_limiter, flags_); auto allocation_size = std::min(estimated_size, checksum_frame_size); switch (checksum_algorithm) diff --git a/dbms/src/Encryption/createWriteBufferFromFileBaseByFileProvider.cpp b/dbms/src/Encryption/createWriteBufferFromFileBaseByFileProvider.cpp index 2f1a2cbaeb8..5e8a6940598 100644 --- a/dbms/src/Encryption/createWriteBufferFromFileBaseByFileProvider.cpp +++ b/dbms/src/Encryption/createWriteBufferFromFileBaseByFileProvider.cpp @@ -20,11 +20,6 @@ #include #include -namespace ProfileEvents -{ -extern const Event CreatedWriteBufferOrdinary; -} - namespace DB { namespace ErrorCodes @@ -49,7 +44,6 @@ createWriteBufferFromFileBaseByFileProvider( { if ((aio_threshold == 0) || (estimated_size < aio_threshold)) { - ProfileEvents::increment(ProfileEvents::CreatedWriteBufferOrdinary); return std::make_unique( file_provider, filename_, @@ -81,7 +75,6 @@ createWriteBufferFromFileBaseByFileProvider( int flags_, mode_t mode) { - ProfileEvents::increment(ProfileEvents::CreatedWriteBufferOrdinary); auto file_ptr = file_provider->newWritableFile(filename_, encryption_path_, true, create_new_encryption_info_, write_limiter_, flags_, mode); switch (checksum_algorithm) diff --git a/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp b/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp index a1c6061948a..1609c83b029 100644 --- a/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp +++ 
b/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -41,7 +40,7 @@ extern const int NOT_IMPLEMENTED; const IColumn * getNestedCol(const IColumn * flash_col) { if (flash_col->isColumnNullable()) - return dynamic_cast(flash_col)->getNestedColumnPtr().get(); + return static_cast(flash_col)->getNestedColumnPtr().get(); else return flash_col; } @@ -75,8 +74,8 @@ bool flashDecimalColToArrowColInternal( const IColumn * nested_col = getNestedCol(flash_col_untyped); if (checkColumn>(nested_col) && checkDataType>(data_type)) { - const ColumnDecimal * flash_col = checkAndGetColumn>(nested_col); - const DataTypeDecimal * type = checkAndGetDataType>(data_type); + const auto * flash_col = checkAndGetColumn>(nested_col); + const auto * type = checkAndGetDataType>(data_type); UInt32 scale = type->getScale(); for (size_t i = start_index; i < end_index; i++) { @@ -92,8 +91,8 @@ bool flashDecimalColToArrowColInternal( std::vector digits; digits.reserve(type->getPrec()); decimalToVector(dec.value, digits, scale); - TiDBDecimal tiDecimal(scale, digits, dec.value < 0); - dag_column.append(tiDecimal); + TiDBDecimal ti_decimal(scale, digits, dec.value < 0); + dag_column.append(ti_decimal); } return true; } @@ -121,7 +120,7 @@ template bool flashIntegerColToArrowColInternal(TiDBColumn & dag_column, const IColumn * flash_col_untyped, size_t start_index, size_t end_index) { const IColumn * nested_col = getNestedCol(flash_col_untyped); - if (const ColumnVector * flash_col = checkAndGetColumn>(nested_col)) + if (const auto * flash_col = checkAndGetColumn>(nested_col)) { constexpr bool is_unsigned = std::is_unsigned_v; for (size_t i = start_index; i < end_index; i++) @@ -135,9 +134,9 @@ bool flashIntegerColToArrowColInternal(TiDBColumn & dag_column, const IColumn * } } if constexpr (is_unsigned) - dag_column.append((UInt64)flash_col->getElement(i)); + dag_column.append(static_cast(flash_col->getElement(i))); else - dag_column.append((Int64)flash_col->getElement(i)); + dag_column.append(static_cast(flash_col->getElement(i))); } return true; } @@ -148,7 +147,7 @@ template void flashDoubleColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col_untyped, size_t start_index, size_t end_index) { const IColumn * nested_col = getNestedCol(flash_col_untyped); - if (const ColumnVector * flash_col = checkAndGetColumn>(nested_col)) + if (const auto * flash_col = checkAndGetColumn>(nested_col)) { for (size_t i = start_index; i < end_index; i++) { @@ -160,7 +159,7 @@ void flashDoubleColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col continue; } } - dag_column.append((T)flash_col->getElement(i)); + dag_column.append(static_cast(flash_col->getElement(i))); } return; } @@ -196,7 +195,7 @@ void flashDateOrDateTimeColToArrowCol( { const IColumn * nested_col = getNestedCol(flash_col_untyped); using DateFieldType = DataTypeMyTimeBase::FieldType; - auto * flash_col = checkAndGetColumn>(nested_col); + const auto * flash_col = checkAndGetColumn>(nested_col); for (size_t i = start_index; i < end_index; i++) { if constexpr (is_nullable) @@ -217,7 +216,7 @@ void flashStringColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col { const IColumn * nested_col = getNestedCol(flash_col_untyped); // columnFixedString is not used so do not check it - auto * flash_col = checkAndGetColumn(nested_col); + const auto * flash_col = checkAndGetColumn(nested_col); for (size_t i = start_index; i < end_index; i++) { // todo check if we can convert 
flash_col to DAG col directly since the internal representation is almost the same @@ -242,7 +241,7 @@ void flashBitColToArrowCol( const tipb::FieldType & field_type) { const IColumn * nested_col = getNestedCol(flash_col_untyped); - auto * flash_col = checkAndGetColumn>(nested_col); + const auto * flash_col = checkAndGetColumn>(nested_col); for (size_t i = start_index; i < end_index; i++) { if constexpr (is_nullable) @@ -267,7 +266,7 @@ void flashEnumColToArrowCol( const IDataType * data_type) { const IColumn * nested_col = getNestedCol(flash_col_untyped); - auto * flash_col = checkAndGetColumn>(nested_col); + const auto * flash_col = checkAndGetColumn>(nested_col); const auto * enum_type = checkAndGetDataType(data_type); size_t enum_value_size = enum_type->getValues().size(); for (size_t i = start_index; i < end_index; i++) @@ -280,10 +279,10 @@ void flashEnumColToArrowCol( continue; } } - auto enum_value = (UInt64)flash_col->getElement(i); + auto enum_value = static_cast(flash_col->getElement(i)); if (enum_value == 0 || enum_value > enum_value_size) throw TiFlashException("number of enum overflow enum boundary", Errors::Coprocessor::Internal); - TiDBEnum ti_enum(enum_value, enum_type->getNameForValue((const DataTypeEnum16::FieldType)enum_value)); + TiDBEnum ti_enum(enum_value, enum_type->getNameForValue(static_cast(enum_value))); dag_column.append(ti_enum); } } @@ -300,7 +299,7 @@ void flashColToArrowCol(TiDBColumn & dag_column, const ColumnWithTypeAndName & f throw TiFlashException("Flash column and TiDB column has different not null flag", Errors::Coprocessor::Internal); } if (type->isNullable()) - type = dynamic_cast(type)->getNestedType().get(); + type = static_cast(type)->getNestedType().get(); switch (tidb_column_info.tp) { @@ -457,7 +456,7 @@ const char * arrowEnumColToFlashCol( { if (checkNull(i, null_count, null_bitmap, col)) continue; - const auto enum_value = (Int64)toLittleEndian(*(reinterpret_cast(pos + offsets[i]))); + const auto enum_value = static_cast(toLittleEndian(*(reinterpret_cast(pos + offsets[i])))); col.column->assumeMutable()->insert(Field(enum_value)); } return pos + offsets[length]; @@ -479,11 +478,11 @@ const char * arrowBitColToFlashCol( continue; const String value = String(pos + offsets[i], pos + offsets[i + 1]); if (value.length() == 0) - col.column->assumeMutable()->insert(Field(UInt64(0))); + col.column->assumeMutable()->insert(Field(static_cast(0))); UInt64 result = 0; - for (auto & c : value) + for (const auto & c : value) { - result = (result << 8u) | (UInt8)c; + result = (result << 8u) | static_cast(c); } col.column->assumeMutable()->insert(Field(result)); } @@ -500,7 +499,7 @@ T toCHDecimal(UInt8 digits_int, UInt8 digits_frac, bool negative, const Int32 * UInt8 tailing_digit = digits_frac % DIGITS_PER_WORD; typename T::NativeType value = 0; - const int word_max = int(1e9); + const int word_max = static_cast(1e9); for (int i = 0; i < word_int; i++) { value = value * word_max + word_buf[i]; @@ -552,28 +551,28 @@ const char * arrowDecimalColToFlashCol( pos += 1; Int32 word_buf[MAX_WORD_BUF_LEN]; const DataTypePtr decimal_type - = col.type->isNullable() ? dynamic_cast(col.type.get())->getNestedType() : col.type; - for (int j = 0; j < MAX_WORD_BUF_LEN; j++) + = col.type->isNullable() ? 
static_cast(col.type.get())->getNestedType() : col.type; + for (int & j : word_buf) { - word_buf[j] = toLittleEndian(*(reinterpret_cast(pos))); + j = toLittleEndian(*(reinterpret_cast(pos))); pos += 4; } - if (auto * type32 = checkDecimal(*decimal_type)) + if (const auto * type32 = checkDecimal(*decimal_type)) { auto res = toCHDecimal(digits_int, digits_frac, negative, word_buf); col.column->assumeMutable()->insert(DecimalField(res, type32->getScale())); } - else if (auto * type64 = checkDecimal(*decimal_type)) + else if (const auto * type64 = checkDecimal(*decimal_type)) { auto res = toCHDecimal(digits_int, digits_frac, negative, word_buf); col.column->assumeMutable()->insert(DecimalField(res, type64->getScale())); } - else if (auto * type128 = checkDecimal(*decimal_type)) + else if (const auto * type128 = checkDecimal(*decimal_type)) { auto res = toCHDecimal(digits_int, digits_frac, negative, word_buf); col.column->assumeMutable()->insert(DecimalField(res, type128->getScale())); } - else if (auto * type256 = checkDecimal(*decimal_type)) + else if (const auto * type256 = checkDecimal(*decimal_type)) { auto res = toCHDecimal(digits_int, digits_frac, negative, word_buf); col.column->assumeMutable()->insert(DecimalField(res, type256->getScale())); @@ -600,13 +599,13 @@ const char * arrowDateColToFlashCol( continue; } UInt64 chunk_time = toLittleEndian(*(reinterpret_cast(pos))); - UInt16 year = (UInt16)((chunk_time & MyTimeBase::YEAR_BIT_FIELD_MASK) >> MyTimeBase::YEAR_BIT_FIELD_OFFSET); - UInt8 month = (UInt8)((chunk_time & MyTimeBase::MONTH_BIT_FIELD_MASK) >> MyTimeBase::MONTH_BIT_FIELD_OFFSET); - UInt8 day = (UInt8)((chunk_time & MyTimeBase::DAY_BIT_FIELD_MASK) >> MyTimeBase::DAY_BIT_FIELD_OFFSET); - UInt16 hour = (UInt16)((chunk_time & MyTimeBase::HOUR_BIT_FIELD_MASK) >> MyTimeBase::HOUR_BIT_FIELD_OFFSET); - UInt8 minute = (UInt8)((chunk_time & MyTimeBase::MINUTE_BIT_FIELD_MASK) >> MyTimeBase::MINUTE_BIT_FIELD_OFFSET); - UInt8 second = (UInt8)((chunk_time & MyTimeBase::SECOND_BIT_FIELD_MASK) >> MyTimeBase::SECOND_BIT_FIELD_OFFSET); - UInt32 micro_second = (UInt32)((chunk_time & MyTimeBase::MICROSECOND_BIT_FIELD_MASK) >> MyTimeBase::MICROSECOND_BIT_FIELD_OFFSET); + auto year = static_cast((chunk_time & MyTimeBase::YEAR_BIT_FIELD_MASK) >> MyTimeBase::YEAR_BIT_FIELD_OFFSET); + auto month = static_cast((chunk_time & MyTimeBase::MONTH_BIT_FIELD_MASK) >> MyTimeBase::MONTH_BIT_FIELD_OFFSET); + auto day = static_cast((chunk_time & MyTimeBase::DAY_BIT_FIELD_MASK) >> MyTimeBase::DAY_BIT_FIELD_OFFSET); + auto hour = static_cast((chunk_time & MyTimeBase::HOUR_BIT_FIELD_MASK) >> MyTimeBase::HOUR_BIT_FIELD_OFFSET); + auto minute = static_cast((chunk_time & MyTimeBase::MINUTE_BIT_FIELD_MASK) >> MyTimeBase::MINUTE_BIT_FIELD_OFFSET); + auto second = static_cast((chunk_time & MyTimeBase::SECOND_BIT_FIELD_MASK) >> MyTimeBase::SECOND_BIT_FIELD_OFFSET); + auto micro_second = static_cast((chunk_time & MyTimeBase::MICROSECOND_BIT_FIELD_MASK) >> MyTimeBase::MICROSECOND_BIT_FIELD_OFFSET); MyDateTime mt(year, month, day, hour, minute, second, micro_second); pos += field_length; col.column->assumeMutable()->insert(Field(mt.toPackedUInt())); @@ -659,7 +658,7 @@ const char * arrowNumColToFlashCol( case TiDB::TypeFloat: u32 = toLittleEndian(*(reinterpret_cast(pos))); std::memcpy(&f32, &u32, sizeof(Float32)); - col.column->assumeMutable()->insert(Field((Float64)f32)); + col.column->assumeMutable()->insert(Field(static_cast(f32))); break; case TiDB::TypeDouble: u64 = toLittleEndian(*(reinterpret_cast(pos))); diff --git 
a/dbms/src/Flash/Coprocessor/CoprocessorReader.h b/dbms/src/Flash/Coprocessor/CoprocessorReader.h index 25c07cff49c..b48fdbcd6dc 100644 --- a/dbms/src/Flash/Coprocessor/CoprocessorReader.h +++ b/dbms/src/Flash/Coprocessor/CoprocessorReader.h @@ -139,7 +139,8 @@ class CoprocessorReader return detail; } - CoprocessorReaderResult nextResult(std::queue & block_queue, const Block & header) + // stream_id is only meaningful for ExchangeReceiver. + CoprocessorReaderResult nextResult(std::queue & block_queue, const Block & header, size_t /*stream_id*/) { auto && [result, has_next] = resp_iter.next(); if (!result.error.empty()) diff --git a/dbms/src/Flash/Coprocessor/DAGContext.cpp b/dbms/src/Flash/Coprocessor/DAGContext.cpp index a17eaf53b64..1cf7a0d6c87 100644 --- a/dbms/src/Flash/Coprocessor/DAGContext.cpp +++ b/dbms/src/Flash/Coprocessor/DAGContext.cpp @@ -30,6 +30,8 @@ extern const int DIVIDED_BY_ZERO; extern const int INVALID_TIME; } // namespace ErrorCodes +const String enableFineGrainedShuffleExtraInfo = "enable fine grained shuffle"; + bool strictSqlMode(UInt64 sql_mode) { return sql_mode & TiDBSQLMode::STRICT_ALL_TABLES || sql_mode & TiDBSQLMode::STRICT_TRANS_TABLES; @@ -210,60 +212,21 @@ void DAGContext::attachBlockIO(const BlockIO & io_) { io = io_; } -void DAGContext::initExchangeReceiverIfMPP(Context & context, size_t max_streams) -{ - if (isMPPTask()) - { - if (mpp_exchange_receiver_map_inited) - throw TiFlashException("Repeatedly initialize mpp_exchange_receiver_map", Errors::Coprocessor::Internal); - traverseExecutors(dag_request, [&](const tipb::Executor & executor) { - if (executor.tp() == tipb::ExecType::TypeExchangeReceiver) - { - assert(executor.has_executor_id()); - const auto & executor_id = executor.executor_id(); - // In order to distinguish different exchange receivers. 
- auto exchange_receiver = std::make_shared( - std::make_shared( - executor.exchange_receiver(), - getMPPTaskMeta(), - context.getTMTContext().getKVCluster(), - context.getTMTContext().getMPPTaskManager(), - context.getSettingsRef().enable_local_tunnel, - context.getSettingsRef().enable_async_grpc_client), - executor.exchange_receiver().encoded_task_meta_size(), - max_streams, - log->identifier(), - executor_id); - mpp_exchange_receiver_map[executor_id] = exchange_receiver; - new_thread_count_of_exchange_receiver += exchange_receiver->computeNewThreadCount(); - } - return true; - }); - mpp_exchange_receiver_map_inited = true; - } -} - -const std::unordered_map> & DAGContext::getMPPExchangeReceiverMap() const +ExchangeReceiverPtr DAGContext::getMPPExchangeReceiver(const String & executor_id) const { if (!isMPPTask()) throw TiFlashException("mpp_exchange_receiver_map is used in mpp only", Errors::Coprocessor::Internal); - if (!mpp_exchange_receiver_map_inited) - throw TiFlashException("mpp_exchange_receiver_map has not been initialized", Errors::Coprocessor::Internal); - return mpp_exchange_receiver_map; -} - -void DAGContext::cancelAllExchangeReceiver() -{ - for (auto & it : mpp_exchange_receiver_map) - { - it.second->cancel(); - } + RUNTIME_ASSERT(mpp_receiver_set != nullptr, log, "MPPTask without receiver set"); + return mpp_receiver_set->getExchangeReceiver(executor_id); } -int DAGContext::getNewThreadCountOfExchangeReceiver() const +void DAGContext::addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader) { - return new_thread_count_of_exchange_receiver; + if (!isMPPTask()) + return; + RUNTIME_ASSERT(mpp_receiver_set != nullptr, log, "MPPTask without receiver set"); + return mpp_receiver_set->addCoprocessorReader(coprocessor_reader); } bool DAGContext::containsRegionsInfoForTable(Int64 table_id) const diff --git a/dbms/src/Flash/Coprocessor/DAGContext.h b/dbms/src/Flash/Coprocessor/DAGContext.h index 8d84a7c6add..7bfc67afcad 100644 --- a/dbms/src/Flash/Coprocessor/DAGContext.h +++ b/dbms/src/Flash/Coprocessor/DAGContext.h @@ -37,6 +37,13 @@ namespace DB class Context; class MPPTunnelSet; class ExchangeReceiver; +using ExchangeReceiverPtr = std::shared_ptr; +/// key: executor_id of ExchangeReceiver nodes in dag. +using ExchangeReceiverMap = std::unordered_map; +class MPPReceiverSet; +using MPPReceiverSetPtr = std::shared_ptr; +class CoprocessorReader; +using CoprocessorReaderPtr = std::shared_ptr; class Join; using JoinPtr = std::shared_ptr; @@ -109,6 +116,13 @@ constexpr UInt64 NO_ENGINE_SUBSTITUTION = 1ul << 30ul; constexpr UInt64 ALLOW_INVALID_DATES = 1ul << 32ul; } // namespace TiDBSQLMode +inline bool enableFineGrainedShuffle(uint64_t stream_count) +{ + return stream_count > 0; +} + +extern const String enableFineGrainedShuffleExtraInfo; + /// A context used to track the information that needs to be passed around during DAG planning. 
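The `enableFineGrainedShuffle` helper above treats a positive `fine_grained_shuffle_stream_count` as both the feature switch and the concurrency hint. A minimal sketch of the dispatch pattern the rest of this patch repeats; the surrounding types are simplified stand-ins rather than the real TiFlash interfaces, and `pickStreamCount` is a hypothetical name:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Mirrors the helper added above; everything else in this sketch is an
// illustrative stand-in, not a real TiFlash interface.
inline bool enableFineGrainedShuffle(uint64_t stream_count)
{
    return stream_count > 0;
}

// Hypothetical helper: when fine grained shuffle is on, concurrency is
// capped by the stream count requested in the plan; otherwise the
// interpreter's max_streams wins.
size_t pickStreamCount(uint64_t fine_grained_stream_count, size_t max_streams)
{
    if (enableFineGrainedShuffle(fine_grained_stream_count))
        return std::min(max_streams, static_cast<size_t>(fine_grained_stream_count));
    return max_streams;
}

int main()
{
    std::printf("%zu\n", pickStreamCount(0, 8)); // 8: feature off, use max_streams
    std::printf("%zu\n", pickStreamCount(4, 8)); // 4: capped by the shuffle stream count
}
```

This matches `handleExchangeReceiver` later in the patch, which caps the receiver's stream count with `std::min(max_streams, exchange_receiver->getFineGrainedShuffleStreamCount())` when the feature is on.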
class DAGContext { @@ -254,7 +268,6 @@ class DAGContext return io; } - int getNewThreadCountOfExchangeReceiver() const; UInt64 getFlags() const { return flags; @@ -305,10 +318,12 @@ class DAGContext bool columnsForTestEmpty() { return columns_for_test_map.empty(); } - void cancelAllExchangeReceiver(); - - void initExchangeReceiverIfMPP(Context & context, size_t max_streams); - const std::unordered_map> & getMPPExchangeReceiverMap() const; + ExchangeReceiverPtr getMPPExchangeReceiver(const String & executor_id) const; + void setMPPReceiverSet(const MPPReceiverSetPtr & receiver_set) + { + mpp_receiver_set = receiver_set; + } + void addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader); void addSubquery(const String & subquery_id, SubqueryForSet && subquery); bool hasSubquery() const { return !subqueries.empty(); } @@ -343,6 +358,10 @@ class DAGContext std::vector output_field_types; std::vector output_offsets; + /// Hold the order of list-based executors. + /// It is used to ensure that execution summaries of list-based executors are emitted in the same order as the executors themselves. + std::vector list_based_executors_order; + private: void initExecutorIdToJoinIdMap(); void initOutputInfo(); @@ -350,7 +369,7 @@ class DAGContext private: /// Hold io for correcting the destruction order. BlockIO io; - /// profile_streams_map is a map that maps from executor_id to profile BlockInputStreams + /// profile_streams_map is a map that maps from executor_id to profile BlockInputStreams. std::unordered_map profile_streams_map; /// executor_id_to_join_id_map is a map that maps executor id to all the join executor id of itself and all its children. std::unordered_map> executor_id_to_join_id_map; @@ -369,10 +388,8 @@ class DAGContext ConcurrentBoundedQueue warnings; /// warning_count is the actual warning count during the entire execution std::atomic warning_count; - int new_thread_count_of_exchange_receiver = 0; - /// key: executor_id of ExchangeReceiver nodes in dag. - std::unordered_map> mpp_exchange_receiver_map; - bool mpp_exchange_receiver_map_inited = false; + + MPPReceiverSetPtr mpp_receiver_set; /// vector of SubqueriesForSets(such as join build subquery). /// The order of the vector is also the order of the subquery. 
std::vector subqueries; diff --git a/dbms/src/Flash/Coprocessor/DAGDriver.cpp b/dbms/src/Flash/Coprocessor/DAGDriver.cpp index 55a2024a8bc..9fe388f8fe4 100644 --- a/dbms/src/Flash/Coprocessor/DAGDriver.cpp +++ b/dbms/src/Flash/Coprocessor/DAGDriver.cpp @@ -72,6 +72,7 @@ DAGDriver::DAGDriver( ::grpc::ServerWriter<::coprocessor::BatchResponse> * writer_, bool internal_) : context(context_) + , dag_response(nullptr) , writer(writer_) , internal(internal_) , log(&Poco::Logger::get("DAGDriver")) @@ -129,7 +130,7 @@ try auto streaming_writer = std::make_shared(writer); TiDB::TiDBCollators collators; - std::unique_ptr response_writer = std::make_unique>( + std::unique_ptr response_writer = std::make_unique>( streaming_writer, std::vector(), collators, @@ -137,7 +138,9 @@ try context.getSettingsRef().dag_records_per_chunk, context.getSettingsRef().batch_send_min_limit, true, - dag_context); + dag_context, + /*fine_grained_shuffle_stream_count=*/0, + /*fine_grained_shuffle_batch_size=*/0); dag_output_stream = std::make_shared(streams.in->getHeader(), std::move(response_writer)); copyData(*streams.in, *dag_output_stream); } diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp index 9b765f30cc6..5fbd86e9762 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp @@ -1124,6 +1124,24 @@ std::pair DAGExpressionAnalyzer::isCastRequiredForRootFinalProjec return std::make_pair(need_append_type_cast, std::move(need_append_type_cast_vec)); } +NamesWithAliases DAGExpressionAnalyzer::appendFinalProjectForRootQueryBlock( + ExpressionActionsChain & chain, + const std::vector & schema, + const std::vector & output_offsets, + const String & column_prefix, + bool keep_session_timezone_info) +{ + auto & step = initAndGetLastStep(chain); + + NamesWithAliases final_project = buildFinalProjection(step.actions, schema, output_offsets, column_prefix, keep_session_timezone_info); + + for (const auto & name : final_project) + { + step.required_output.push_back(name.first); + } + return final_project; +} + NamesWithAliases DAGExpressionAnalyzer::buildFinalProjection( const ExpressionActionsPtr & actions, const std::vector & schema, @@ -1148,24 +1166,6 @@ NamesWithAliases DAGExpressionAnalyzer::buildFinalProjection( return genRootFinalProjectAliases(column_prefix, output_offsets); } -NamesWithAliases DAGExpressionAnalyzer::appendFinalProjectForRootQueryBlock( - ExpressionActionsChain & chain, - const std::vector & schema, - const std::vector & output_offsets, - const String & column_prefix, - bool keep_session_timezone_info) -{ - auto & step = initAndGetLastStep(chain); - - NamesWithAliases final_project = buildFinalProjection(step.actions, schema, output_offsets, column_prefix, keep_session_timezone_info); - - for (const auto & name : final_project) - { - step.required_output.push_back(name.first); - } - return final_project; -} - String DAGExpressionAnalyzer::alignReturnType( const tipb::Expr & expr, const ExpressionActionsPtr & actions, diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h index 7506047b34f..c42312b95c3 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h @@ -166,7 +166,7 @@ class DAGExpressionAnalyzer : private boost::noncopyable void appendCastAfterWindow( const ExpressionActionsPtr & actions, const tipb::Window & window, - const size_t 
window_columns_start_index); + size_t window_columns_start_index); NamesAndTypes buildOrderColumns( const ExpressionActionsPtr & actions, @@ -199,6 +199,7 @@ class DAGExpressionAnalyzer : private boost::noncopyable #ifndef DBMS_PUBLIC_GTEST private: #endif + String buildTupleFunctionForGroupConcat( const tipb::Expr & expr, SortDescription & sort_desc, diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp index ee529680d28..23bbb4586b3 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp @@ -450,6 +450,7 @@ DAGExpressionAnalyzerHelper::FunctionBuilderMap DAGExpressionAnalyzerHelper::fun {"bitOr", DAGExpressionAnalyzerHelper::buildBitwiseFunction}, {"bitXor", DAGExpressionAnalyzerHelper::buildBitwiseFunction}, {"bitNot", DAGExpressionAnalyzerHelper::buildBitwiseFunction}, + {"bitShiftRight", DAGExpressionAnalyzerHelper::buildBitwiseFunction}, {"leftUTF8", DAGExpressionAnalyzerHelper::buildLeftUTF8Function}, {"date_add", DAGExpressionAnalyzerHelper::buildDateAddOrSubFunction}, {"date_sub", DAGExpressionAnalyzerHelper::buildDateAddOrSubFunction}, diff --git a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp index 81fe2c8f713..d67a8f6ec52 100644 --- a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp +++ b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp @@ -346,14 +346,26 @@ void DAGQueryBlockInterpreter::executeWhere(DAGPipeline & pipeline, const Expres void DAGQueryBlockInterpreter::executeWindow( DAGPipeline & pipeline, - WindowDescription & window_description) + WindowDescription & window_description, + bool enable_fine_grained_shuffle) { executeExpression(pipeline, window_description.before_window, log, "before window"); - /// If there are several streams, we merge them into one - executeUnion(pipeline, max_streams, log, false, "merge into one for window input"); - assert(pipeline.streams.size() == 1); - pipeline.firstStream() = std::make_shared(pipeline.firstStream(), window_description, log->identifier()); + if (enable_fine_grained_shuffle) + { + /// Window function can be multi-threaded when fine grained shuffle is enabled. + pipeline.transform([&](auto & stream) { + stream = std::make_shared(stream, window_description, log->identifier()); + stream->setExtraInfo(enableFineGrainedShuffleExtraInfo); + }); + } + else + { + /// If there are several streams, we merge them into one. 
+ executeUnion(pipeline, max_streams, log, false, "merge into one for window input"); + assert(pipeline.streams.size() == 1); + pipeline.firstStream() = std::make_shared(pipeline.firstStream(), window_description, log->identifier()); + } } void DAGQueryBlockInterpreter::executeAggregation( @@ -379,34 +391,39 @@ void DAGQueryBlockInterpreter::executeAggregation( is_final_agg); /// If there are several sources, then we perform parallel aggregation - if (pipeline.streams.size() > 1) + if (pipeline.streams.size() > 1 || pipeline.streams_with_non_joined_data.size() > 1) { const Settings & settings = context.getSettingsRef(); - BlockInputStreamPtr stream_with_non_joined_data = combinedNonJoinedDataStream(pipeline, max_streams, log); - pipeline.firstStream() = std::make_shared( + BlockInputStreamPtr stream = std::make_shared( pipeline.streams, - stream_with_non_joined_data, + pipeline.streams_with_non_joined_data, params, context.getFileProvider(), true, max_streams, settings.aggregation_memory_efficient_merge_threads ? static_cast(settings.aggregation_memory_efficient_merge_threads) : static_cast(settings.max_threads), log->identifier()); + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); + // should record for agg before restore concurrency. See #3804. recordProfileStreams(pipeline, query_block.aggregation_name); restorePipelineConcurrency(pipeline); } else { - BlockInputStreamPtr stream_with_non_joined_data = combinedNonJoinedDataStream(pipeline, max_streams, log); BlockInputStreams inputs; if (!pipeline.streams.empty()) inputs.push_back(pipeline.firstStream()); - else - pipeline.streams.resize(1); - if (stream_with_non_joined_data) - inputs.push_back(stream_with_non_joined_data); + + if (!pipeline.streams_with_non_joined_data.empty()) + inputs.push_back(pipeline.streams_with_non_joined_data.at(0)); + + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::make_shared( std::make_shared(inputs, log->identifier()), params, @@ -417,15 +434,15 @@ void DAGQueryBlockInterpreter::executeAggregation( } } -void DAGQueryBlockInterpreter::executeWindowOrder(DAGPipeline & pipeline, SortDescription sort_desc) +void DAGQueryBlockInterpreter::executeWindowOrder(DAGPipeline & pipeline, SortDescription sort_desc, bool enable_fine_grained_shuffle) { - orderStreams(pipeline, max_streams, sort_desc, 0, context, log); + orderStreams(pipeline, max_streams, sort_desc, 0, enable_fine_grained_shuffle, context, log); } void DAGQueryBlockInterpreter::executeOrder(DAGPipeline & pipeline, const NamesAndTypes & order_columns) { Int64 limit = query_block.limit_or_topn->topn().limit(); - orderStreams(pipeline, max_streams, getSortDescription(order_columns, query_block.limit_or_topn->topn().order_by()), limit, context, log); + orderStreams(pipeline, max_streams, getSortDescription(order_columns, query_block.limit_or_topn->topn().order_by()), limit, false, context, log); } void DAGQueryBlockInterpreter::recordProfileStreams(DAGPipeline & pipeline, const String & key) @@ -436,17 +453,30 @@ void DAGQueryBlockInterpreter::recordProfileStreams(DAGPipeline & pipeline, cons void DAGQueryBlockInterpreter::handleExchangeReceiver(DAGPipeline & pipeline) { - auto it = dagContext().getMPPExchangeReceiverMap().find(query_block.source_name); - if (unlikely(it == dagContext().getMPPExchangeReceiverMap().end())) + auto exchange_receiver = dagContext().getMPPExchangeReceiver(query_block.source_name); + if 
(unlikely(exchange_receiver == nullptr)) throw Exception("Can not find exchange receiver for " + query_block.source_name, ErrorCodes::LOGICAL_ERROR); // todo choose a more reasonable stream number auto & exchange_receiver_io_input_streams = dagContext().getInBoundIOInputStreamsMap()[query_block.source_name]; - for (size_t i = 0; i < max_streams; ++i) + + const bool enable_fine_grained_shuffle = enableFineGrainedShuffle(exchange_receiver->getFineGrainedShuffleStreamCount()); + String extra_info = "squashing after exchange receiver"; + size_t stream_count = max_streams; + if (enable_fine_grained_shuffle) + { + extra_info += ", " + enableFineGrainedShuffleExtraInfo; + stream_count = std::min(max_streams, exchange_receiver->getFineGrainedShuffleStreamCount()); + } + + for (size_t i = 0; i < stream_count; ++i) { - BlockInputStreamPtr stream = std::make_shared(it->second, log->identifier(), query_block.source_name); + BlockInputStreamPtr stream = std::make_shared(exchange_receiver, + log->identifier(), + query_block.source_name, + /*stream_id=*/enable_fine_grained_shuffle ? i : 0); exchange_receiver_io_input_streams.push_back(stream); stream = std::make_shared(stream, 8192, 0, log->identifier()); - stream->setExtraInfo("squashing after exchange receiver"); + stream->setExtraInfo(extra_info); pipeline.streams.push_back(stream); } NamesAndTypes source_columns; @@ -508,7 +538,7 @@ void DAGQueryBlockInterpreter::handleProjection(DAGPipeline & pipeline, const ti analyzer = std::make_unique(std::move(output_columns), context); } -void DAGQueryBlockInterpreter::handleWindow(DAGPipeline & pipeline, const tipb::Window & window) +void DAGQueryBlockInterpreter::handleWindow(DAGPipeline & pipeline, const tipb::Window & window, bool enable_fine_grained_shuffle) { NamesAndTypes input_columns; assert(input_streams_vec.size() == 1); @@ -517,13 +547,13 @@ void DAGQueryBlockInterpreter::handleWindow(DAGPipeline & pipeline, const tipb:: input_columns.emplace_back(p.name, p.type); DAGExpressionAnalyzer dag_analyzer(input_columns, context); WindowDescription window_description = dag_analyzer.buildWindowDescription(window); - executeWindow(pipeline, window_description); + executeWindow(pipeline, window_description, enable_fine_grained_shuffle); executeExpression(pipeline, window_description.after_window, log, "cast after window"); analyzer = std::make_unique(window_description.after_window_columns, context); } -void DAGQueryBlockInterpreter::handleWindowOrder(DAGPipeline & pipeline, const tipb::Sort & window_sort) +void DAGQueryBlockInterpreter::handleWindowOrder(DAGPipeline & pipeline, const tipb::Sort & window_sort, bool enable_fine_grained_shuffle) { NamesAndTypes input_columns; assert(input_streams_vec.size() == 1); @@ -532,7 +562,7 @@ void DAGQueryBlockInterpreter::handleWindowOrder(DAGPipeline & pipeline, const t input_columns.emplace_back(p.name, p.type); DAGExpressionAnalyzer dag_analyzer(input_columns, context); auto order_columns = dag_analyzer.buildWindowOrderColumns(window_sort); - executeWindowOrder(pipeline, getSortDescription(order_columns, window_sort.byitems())); + executeWindowOrder(pipeline, getSortDescription(order_columns, window_sort.byitems()), enable_fine_grained_shuffle); analyzer = std::make_unique(std::move(input_columns), context); } @@ -580,13 +610,13 @@ void DAGQueryBlockInterpreter::executeImpl(DAGPipeline & pipeline) } else if (query_block.source->tp() == tipb::ExecType::TypeWindow) { - handleWindow(pipeline, query_block.source->window()); + handleWindow(pipeline, 
query_block.source->window(), enableFineGrainedShuffle(query_block.source->fine_grained_shuffle_stream_count())); recordProfileStreams(pipeline, query_block.source_name); restorePipelineConcurrency(pipeline); } else if (query_block.source->tp() == tipb::ExecType::TypeSort) { - handleWindowOrder(pipeline, query_block.source->sort()); + handleWindowOrder(pipeline, query_block.source->sort(), enableFineGrainedShuffle(query_block.source->fine_grained_shuffle_stream_count())); recordProfileStreams(pipeline, query_block.source_name); } else @@ -692,19 +722,47 @@ void DAGQueryBlockInterpreter::handleExchangeSender(DAGPipeline & pipeline) std::vector partition_col_ids = ExchangeSenderInterpreterHelper::genPartitionColIds(exchange_sender); TiDB::TiDBCollators partition_col_collators = ExchangeSenderInterpreterHelper::genPartitionColCollators(exchange_sender); int stream_id = 0; - pipeline.transform([&](auto & stream) { - // construct writer - std::unique_ptr response_writer = std::make_unique>( - context.getDAGContext()->tunnel_set, - partition_col_ids, - partition_col_collators, - exchange_sender.tp(), - context.getSettingsRef().dag_records_per_chunk, - context.getSettingsRef().batch_send_min_limit, - stream_id++ == 0, /// only one stream needs to sending execution summaries for the last response - dagContext()); - stream = std::make_shared(stream, std::move(response_writer), log->identifier()); - }); + const uint64_t stream_count = query_block.exchange_sender->fine_grained_shuffle_stream_count(); + const uint64_t batch_size = query_block.exchange_sender->fine_grained_shuffle_batch_size(); + + if (enableFineGrainedShuffle(stream_count)) + { + pipeline.transform([&](auto & stream) { + // construct writer + std::unique_ptr response_writer = std::make_unique>( + context.getDAGContext()->tunnel_set, + partition_col_ids, + partition_col_collators, + exchange_sender.tp(), + context.getSettingsRef().dag_records_per_chunk, + context.getSettingsRef().batch_send_min_limit, + stream_id++ == 0, /// only one stream needs to send execution summaries for the last response + dagContext(), + stream_count, + batch_size); + stream = std::make_shared(stream, std::move(response_writer), log->identifier()); + stream->setExtraInfo(enableFineGrainedShuffleExtraInfo); + }); + RUNTIME_CHECK(exchange_sender.tp() == tipb::ExchangeType::Hash, Exception, "exchange_sender has to be hash partition when fine grained shuffle is enabled"); + RUNTIME_CHECK(stream_count <= 1024, Exception, "fine_grained_shuffle_stream_count should not be greater than 1024"); + } + else + { + pipeline.transform([&](auto & stream) { + std::unique_ptr response_writer = std::make_unique>( + context.getDAGContext()->tunnel_set, + partition_col_ids, + partition_col_collators, + exchange_sender.tp(), + context.getSettingsRef().dag_records_per_chunk, + context.getSettingsRef().batch_send_min_limit, + stream_id++ == 0, /// only one stream needs to send execution summaries for the last response + dagContext(), + stream_count, + batch_size); + stream = std::make_shared(stream, std::move(response_writer), log->identifier()); + }); + } } void DAGQueryBlockInterpreter::handleMockExchangeSender(DAGPipeline & pipeline) @@ -732,4 +790,4 @@ BlockInputStreams DAGQueryBlockInterpreter::execute() return pipeline.streams; } -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.h b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.h index cabdd4dc9be..c449b37e360 100644 --- 
a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.h +++ b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.h @@ -64,15 +64,16 @@ class DAGQueryBlockInterpreter void handleExchangeReceiver(DAGPipeline & pipeline); void handleMockExchangeReceiver(DAGPipeline & pipeline); void handleProjection(DAGPipeline & pipeline, const tipb::Projection & projection); - void handleWindow(DAGPipeline & pipeline, const tipb::Window & window); - void handleWindowOrder(DAGPipeline & pipeline, const tipb::Sort & window_sort); + void handleWindow(DAGPipeline & pipeline, const tipb::Window & window, bool enable_fine_grained_shuffle); + void handleWindowOrder(DAGPipeline & pipeline, const tipb::Sort & window_sort, bool enable_fine_grained_shuffle); void executeWhere(DAGPipeline & pipeline, const ExpressionActionsPtr & expressionActionsPtr, String & filter_column, const String & extra_info = ""); - void executeWindowOrder(DAGPipeline & pipeline, SortDescription sort_desc); + void executeWindowOrder(DAGPipeline & pipeline, SortDescription sort_desc, bool enable_fine_grained_shuffle); void executeOrder(DAGPipeline & pipeline, const NamesAndTypes & order_columns); void executeLimit(DAGPipeline & pipeline); void executeWindow( DAGPipeline & pipeline, - WindowDescription & window_description); + WindowDescription & window_description, + bool enable_fine_grained_shuffle); void executeAggregation( DAGPipeline & pipeline, const ExpressionActionsPtr & expression_actions_ptr, diff --git a/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp b/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp index 882699e1599..d68a7b17aaa 100644 --- a/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp +++ b/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp @@ -20,6 +20,26 @@ namespace DB { +namespace +{ +void fillOrderForListBasedExecutors(DAGContext & dag_context, const DAGQueryBlock & query_block) +{ + assert(query_block.source); + auto & list_based_executors_order = dag_context.list_based_executors_order; + list_based_executors_order.push_back(query_block.source_name); + if (query_block.selection) + list_based_executors_order.push_back(query_block.selection_name); + if (query_block.aggregation) + list_based_executors_order.push_back(query_block.aggregation_name); + if (query_block.having) + list_based_executors_order.push_back(query_block.having_name); + if (query_block.limit_or_topn) + list_based_executors_order.push_back(query_block.limit_or_topn_name); + if (query_block.exchange_sender) + dag_context.list_based_executors_order.push_back(query_block.exchange_sender_name); +} +} // namespace + DAGQuerySource::DAGQuerySource(Context & context_) : context(context_) { @@ -32,6 +52,9 @@ DAGQuerySource::DAGQuerySource(Context & context_) else { root_query_block = std::make_shared(1, dag_request.executors()); + auto & dag_context = getDAGContext(); + if (!dag_context.return_executor_id) + fillOrderForListBasedExecutors(dag_context, *root_query_block); } } diff --git a/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp b/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp index 53bebc91da8..33f6d99f9d8 100644 --- a/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp +++ b/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp @@ -89,12 +89,10 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo } } - /// add execution_summary for local executor - for (auto & p : dag_context.getProfileStreamsMap()) - { + auto fill_execution_summary = [&](const String & executor_id, const BlockInputStreams & streams) { ExecutionSummary current; /// part 1: 
local execution info - for (auto & stream_ptr : p.second) + for (const auto & stream_ptr : streams) { if (auto * p_stream = dynamic_cast(stream_ptr.get())) { @@ -105,16 +103,16 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo current.concurrency++; } /// part 2: remote execution info - if (merged_remote_execution_summaries.find(p.first) != merged_remote_execution_summaries.end()) + if (merged_remote_execution_summaries.find(executor_id) != merged_remote_execution_summaries.end()) { - for (auto & remote : merged_remote_execution_summaries[p.first]) + for (auto & remote : merged_remote_execution_summaries[executor_id]) current.merge(remote, false); } /// part 3: for join need to add the build time /// In TiFlash, a hash join's build side is finished before probe side starts, /// so the join probe side's running time does not include hash table's build time, /// when construct ExecSummaries, we need add the build cost to probe executor - auto all_join_id_it = dag_context.getExecutorIdToJoinIdMap().find(p.first); + auto all_join_id_it = dag_context.getExecutorIdToJoinIdMap().find(executor_id); if (all_join_id_it != dag_context.getExecutorIdToJoinIdMap().end()) { for (const auto & join_executor_id : all_join_id_it->second) @@ -138,8 +136,27 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo } current.time_processed_ns += dag_context.compile_time_ns; - fillTiExecutionSummary(response.add_execution_summaries(), current, p.first, delta_mode); + fillTiExecutionSummary(response.add_execution_summaries(), current, executor_id, delta_mode); + }; + + /// add execution_summary for local executor + if (dag_context.return_executor_id) + { + for (auto & p : dag_context.getProfileStreamsMap()) + fill_execution_summary(p.first, p.second); + } + else + { + const auto & profile_streams_map = dag_context.getProfileStreamsMap(); + assert(profile_streams_map.size() == dag_context.list_based_executors_order.size()); + for (const auto & executor_id : dag_context.list_based_executors_order) + { + auto it = profile_streams_map.find(executor_id); + assert(it != profile_streams_map.end()); + fill_execution_summary(executor_id, it->second); + } } + for (auto & p : merged_remote_execution_summaries) { if (local_executors.find(p.first) == local_executors.end()) diff --git a/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp b/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp index df7e504d2c4..390ce7b9948 100644 --- a/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp +++ b/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -485,7 +486,8 @@ void DAGStorageInterpreter::buildRemoteStreams(std::vector && rem std::vector tasks(all_tasks.begin() + task_start, all_tasks.begin() + task_end); auto coprocessor_reader = std::make_shared(schema, cluster, tasks, has_enforce_encode_type, 1); - BlockInputStreamPtr input = std::make_shared(coprocessor_reader, log->identifier(), table_scan.getTableScanExecutorID()); + context.getDAGContext()->addCoprocessorReader(coprocessor_reader); + BlockInputStreamPtr input = std::make_shared(coprocessor_reader, log->identifier(), table_scan.getTableScanExecutorID(), /*stream_id=*/0); pipeline.streams.push_back(input); task_start = task_end; } @@ -634,6 +636,9 @@ void DAGStorageInterpreter::buildLocalStreams(DAGPipeline & pipeline, size_t max if (total_local_region_num == 0) return; const auto table_query_infos = generateSelectQueryInfos(); 
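The `DAGResponseWriter` hunk above matters because list-based requests carry no executor ids, so summaries are matched to executors by position; iterating `getProfileStreamsMap()`, an `unordered_map`, would emit them in arbitrary order. A minimal sketch of the ordering fix, with hypothetical executor ids and a plain `int` standing in for an execution summary:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

// Minimal illustration, not TiFlash code: an unordered_map has no stable
// iteration order, so a side vector holding executor ids in plan order
// is used to emit summaries positionally.
int main()
{
    std::unordered_map<std::string, int> profile_streams_map
        = {{"limit_2", 30}, {"selection_1", 20}, {"table_scan_0", 10}};
    std::vector<std::string> list_based_executors_order
        = {"table_scan_0", "selection_1", "limit_2"};

    std::vector<int> summaries;
    for (const auto & executor_id : list_based_executors_order)
    {
        auto it = profile_streams_map.find(executor_id);
        assert(it != profile_streams_map.end());
        summaries.push_back(it->second);
    }
    // Plan order is preserved regardless of hash-map iteration order.
    assert((summaries == std::vector<int>{10, 20, 30}));
}
```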
+ bool has_multiple_partitions = table_query_infos.size() > 1; + // MultiPartitionStreamPool is disabled in the non-partitioned or single-partition case + std::shared_ptr stream_pool = has_multiple_partitions ? std::make_shared() : nullptr; for (const auto & table_query_info : table_query_infos) { DAGPipeline current_pipeline; @@ -642,9 +647,6 @@ void DAGStorageInterpreter::buildLocalStreams(DAGPipeline & pipeline, size_t max size_t region_num = query_info.mvcc_query_info->regions_query_info.size(); if (region_num == 0) continue; - /// calculate weighted max_streams for each partition, note at least 1 stream is needed for each partition - size_t current_max_streams = table_query_infos.size() == 1 ? max_streams : (max_streams * region_num + total_local_region_num - 1) / total_local_region_num; - QueryProcessingStage::Enum from_stage = QueryProcessingStage::FetchColumns; assert(storages_with_structure_lock.find(table_id) != storages_with_structure_lock.end()); auto & storage = storages_with_structure_lock[table_id].storage; @@ -654,7 +656,7 @@ void DAGStorageInterpreter::buildLocalStreams(DAGPipeline & pipeline, size_t max { try { - current_pipeline.streams = storage->read(required_columns, query_info, context, from_stage, max_block_size, current_max_streams); + current_pipeline.streams = storage->read(required_columns, query_info, context, from_stage, max_block_size, max_streams); // After getting streams from storage, we need to validate whether Regions have changed or not after learner read. // (by calling `validateQueryInfo`). In case the key ranges of Regions have changed (Region merge/split), those `streams` @@ -778,7 +780,19 @@ void DAGStorageInterpreter::buildLocalStreams(DAGPipeline & pipeline, size_t max throw; } } - pipeline.streams.insert(pipeline.streams.end(), current_pipeline.streams.begin(), current_pipeline.streams.end()); + if (has_multiple_partitions) + stream_pool->addPartitionStreams(current_pipeline.streams); + else + pipeline.streams.insert(pipeline.streams.end(), current_pipeline.streams.begin(), current_pipeline.streams.end()); + } + if (has_multiple_partitions) + { + String req_info = dag_context.isMPPTask() ? 
dag_context.getMPPTaskId().toString() : ""; + int exposed_streams_cnt = std::min(static_cast(max_streams), stream_pool->addedStreamsCnt()); + for (int i = 0; i < exposed_streams_cnt; ++i) + { + pipeline.streams.push_back(std::make_shared(stream_pool, req_info)); + } } } diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index 87f58131c8c..2003103a20a 100644 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -29,7 +29,6 @@ #include namespace DB { - const Int8 VAR_SIZE = 0; extern const String uniq_raw_res_name; @@ -333,7 +332,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::DecimalIsFalseWithNull, "isFalseWithNull"}, //{tipb::ScalarFuncSig::LeftShift, "cast"}, - //{tipb::ScalarFuncSig::RightShift, "cast"}, + {tipb::ScalarFuncSig::RightShift, "bitShiftRight"}, //{tipb::ScalarFuncSig::BitCount, "cast"}, //{tipb::ScalarFuncSig::GetParamString, "cast"}, @@ -514,7 +513,7 @@ const std::unordered_map scalar_func_map({ //{tipb::ScalarFuncSig::YearWeekWithMode, "cast"}, //{tipb::ScalarFuncSig::YearWeekWithoutMode, "cast"}, - //{tipb::ScalarFuncSig::GetFormat, "cast"}, + {tipb::ScalarFuncSig::GetFormat, "getFormat"}, {tipb::ScalarFuncSig::SysDateWithFsp, "sysDateWithFsp"}, {tipb::ScalarFuncSig::SysDateWithoutFsp, "sysDateWithoutFsp"}, //{tipb::ScalarFuncSig::CurrentDate, "cast"}, @@ -562,7 +561,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::Quarter, "toQuarter"}, //{tipb::ScalarFuncSig::SecToTime, "cast"}, - //{tipb::ScalarFuncSig::TimeToSec, "cast"}, + {tipb::ScalarFuncSig::TimeToSec, "tidbTimeToSec"}, //{tipb::ScalarFuncSig::TimestampAdd, "cast"}, {tipb::ScalarFuncSig::ToDays, "tidbToDays"}, {tipb::ScalarFuncSig::ToSeconds, "tidbToSeconds"}, @@ -649,8 +648,8 @@ const std::unordered_map scalar_func_map({ //{tipb::ScalarFuncSig::Quote, "cast"}, //{tipb::ScalarFuncSig::Repeat, "cast"}, {tipb::ScalarFuncSig::Replace, "replaceAll"}, - //{tipb::ScalarFuncSig::ReverseUTF8, "cast"}, - //{tipb::ScalarFuncSig::Reverse, "cast"}, + {tipb::ScalarFuncSig::ReverseUTF8, "reverseUTF8"}, + {tipb::ScalarFuncSig::Reverse, "reverse"}, {tipb::ScalarFuncSig::RightUTF8, "rightUTF8"}, //{tipb::ScalarFuncSig::Right, "cast"}, {tipb::ScalarFuncSig::RpadUTF8, "rpadUTF8"}, @@ -770,6 +769,10 @@ const String & getFunctionName(const tipb::Expr & expr) { return getAggFunctionName(expr); } + else if (isWindowFunctionExpr(expr)) + { + return getWindowFunctionName(expr); + } else { auto it = scalar_func_map.find(expr.sig()); @@ -1429,6 +1432,7 @@ tipb::EncodeType analyzeDAGEncodeType(DAGContext & dag_context) return tipb::EncodeType::TypeDefault; return encode_type; } + tipb::ScalarFuncSig reverseGetFuncSigByFuncName(const String & name) { static std::unordered_map func_name_sig_map = getFuncNameToSigMap(); diff --git a/dbms/src/Flash/Coprocessor/DecodeDetail.h b/dbms/src/Flash/Coprocessor/DecodeDetail.h index 9bad0ca2b72..91851650d9e 100644 --- a/dbms/src/Flash/Coprocessor/DecodeDetail.h +++ b/dbms/src/Flash/Coprocessor/DecodeDetail.h @@ -21,8 +21,12 @@ namespace DB /// Detail of the packet that decoding in TiRemoteInputStream.RemoteReader.decodeChunks() struct DecodeDetail { + // For fine grained shuffle, each ExchangeReceiver/thread will decode its own blocks. + // So this is the row number of partial blocks of the original packet. + // This will be the row number of all blocks of the original packet if it's not fine grained shuffle. Int64 rows = 0; - // byte size of origin packet. 
+ + // Total byte size of the origin packet, even for fine grained shuffle. Int64 packet_bytes = 0; }; -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Flash/Coprocessor/InterpreterDAG.cpp b/dbms/src/Flash/Coprocessor/InterpreterDAG.cpp index 5ca8d26758f..964384ce885 100644 --- a/dbms/src/Flash/Coprocessor/InterpreterDAG.cpp +++ b/dbms/src/Flash/Coprocessor/InterpreterDAG.cpp @@ -25,19 +25,8 @@ namespace DB InterpreterDAG::InterpreterDAG(Context & context_, const DAGQuerySource & dag_) : context(context_) , dag(dag_) + , max_streams(context.getMaxStreams()) { - const Settings & settings = context.getSettingsRef(); - if (dagContext().isBatchCop() || (dagContext().isMPPTask() && !dagContext().isTest())) - max_streams = settings.max_threads; - else if (dagContext().isTest()) - max_streams = dagContext().initialize_concurrency; - else - max_streams = 1; - - if (max_streams > 1) - { - max_streams *= settings.max_streams_to_max_threads_ratio; - } } void setRestorePipelineConcurrency(DAGQueryBlock & query_block) @@ -89,10 +78,6 @@ BlockInputStreams InterpreterDAG::executeQueryBlock(DAGQueryBlock & query_block) BlockIO InterpreterDAG::execute() { - /// Due to learner read, DAGQueryBlockInterpreter may take a long time to build - /// the query plan, so we init mpp exchange receiver before executeQueryBlock - dagContext().initExchangeReceiverIfMPP(context, max_streams); - BlockInputStreams streams = executeQueryBlock(*dag.getRootQueryBlock()); DAGPipeline pipeline; pipeline.streams = streams; diff --git a/dbms/src/Flash/Coprocessor/InterpreterUtils.cpp b/dbms/src/Flash/Coprocessor/InterpreterUtils.cpp index 9de5b83626f..002a06d07b9 100644 --- a/dbms/src/Flash/Coprocessor/InterpreterUtils.cpp +++ b/dbms/src/Flash/Coprocessor/InterpreterUtils.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -42,32 +43,6 @@ void restoreConcurrency( } } -BlockInputStreamPtr combinedNonJoinedDataStream( - DAGPipeline & pipeline, - size_t max_threads, - const LoggerPtr & log, - bool ignore_block) -{ - BlockInputStreamPtr ret = nullptr; - if (pipeline.streams_with_non_joined_data.size() == 1) - ret = pipeline.streams_with_non_joined_data.at(0); - else if (pipeline.streams_with_non_joined_data.size() > 1) - { - if (ignore_block) - { - ret = std::make_shared(pipeline.streams_with_non_joined_data, nullptr, max_threads, log->identifier()); - ret->setExtraInfo("combine non joined(ignore block)"); - } - else - { - ret = std::make_shared(pipeline.streams_with_non_joined_data, nullptr, max_threads, log->identifier()); - ret->setExtraInfo("combine non joined"); - } - } - pipeline.streams_with_non_joined_data.clear(); - return ret; -} - void executeUnion( DAGPipeline & pipeline, size_t max_streams, @@ -75,21 +50,33 @@ void executeUnion( bool ignore_block, const String & extra_info) { - if (pipeline.streams.size() == 1 && pipeline.streams_with_non_joined_data.empty()) - return; - auto non_joined_data_stream = combinedNonJoinedDataStream(pipeline, max_streams, log, ignore_block); - if (!pipeline.streams.empty()) + switch (pipeline.streams.size() + pipeline.streams_with_non_joined_data.size()) + { + case 0: + break; + case 1: { + if (pipeline.streams.size() == 1) + break; + // streams_with_non_joined_data's size is 1. 
+ pipeline.streams.push_back(pipeline.streams_with_non_joined_data.at(0)); + pipeline.streams_with_non_joined_data.clear(); + break; + } + default: + { + BlockInputStreamPtr stream; if (ignore_block) - pipeline.firstStream() = std::make_shared(pipeline.streams, non_joined_data_stream, max_streams, log->identifier()); + stream = std::make_shared(pipeline.streams, pipeline.streams_with_non_joined_data, max_streams, log->identifier()); else - pipeline.firstStream() = std::make_shared(pipeline.streams, non_joined_data_stream, max_streams, log->identifier()); - pipeline.firstStream()->setExtraInfo(extra_info); + stream = std::make_shared(pipeline.streams, pipeline.streams_with_non_joined_data, max_streams, log->identifier()); + stream->setExtraInfo(extra_info); + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); + break; } - else if (non_joined_data_stream != nullptr) - { - pipeline.streams.push_back(non_joined_data_stream); } } @@ -126,10 +113,14 @@ void orderStreams( size_t max_streams, SortDescription order_descr, Int64 limit, + bool enable_fine_grained_shuffle, const Context & context, const LoggerPtr & log) { const Settings & settings = context.getSettingsRef(); + String extra_info; + if (enable_fine_grained_shuffle) + extra_info = enableFineGrainedShuffleExtraInfo; pipeline.transform([&](auto & stream) { auto sorting_stream = std::make_shared(stream, order_descr, log->identifier(), limit); @@ -141,19 +132,37 @@ void orderStreams( sorting_stream->setLimits(limits); stream = sorting_stream; + stream->setExtraInfo(extra_info); }); - /// If there are several streams, we merge them into one - executeUnion(pipeline, max_streams, log, false, "for partial order"); + if (enable_fine_grained_shuffle) + { + pipeline.transform([&](auto & stream) { + stream = std::make_shared( + stream, + order_descr, + settings.max_block_size, + limit, + settings.max_bytes_before_external_sort, + context.getTemporaryPath(), + log->identifier()); + stream->setExtraInfo(enableFineGrainedShuffleExtraInfo); + }); + } + else + { + /// If there are several streams, we merge them into one + executeUnion(pipeline, max_streams, log, false, "for partial order"); - /// Merge the sorted blocks. - pipeline.firstStream() = std::make_shared( - pipeline.firstStream(), - order_descr, - settings.max_block_size, - limit, - settings.max_bytes_before_external_sort, - context.getTemporaryPath(), - log->identifier()); + /// Merge the sorted blocks. 
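The two branches of orderStreams() differ in topology: with fine grained shuffle each stream gets its own MergeSortingBlockInputStream so per-receiver-queue order is preserved, while the classic path (continued just below) first unions all streams and then merges them into one totally sorted stream. A hypothetical call site passing the new flag (variable names assumed from the surrounding code):

```cpp
// Sketch only: in practice the flag would come from the executor's
// fine_grained_shuffle_stream_count in the plan, not be hard-coded.
orderStreams(pipeline, max_streams, order_descr, limit,
             /*enable_fine_grained_shuffle=*/true, context, log);
```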
+ pipeline.firstStream() = std::make_shared( + pipeline.firstStream(), + order_descr, + settings.max_block_size, + limit, + settings.max_bytes_before_external_sort, + context.getTemporaryPath(), + log->identifier()); + } } } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/InterpreterUtils.h b/dbms/src/Flash/Coprocessor/InterpreterUtils.h index 36280f3b903..bd64346718c 100644 --- a/dbms/src/Flash/Coprocessor/InterpreterUtils.h +++ b/dbms/src/Flash/Coprocessor/InterpreterUtils.h @@ -57,6 +57,7 @@ void orderStreams( size_t max_streams, SortDescription order_descr, Int64 limit, + bool enable_fine_grained_shuffle, const Context & context, const LoggerPtr & log); } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp index f915653fe96..a72dfcc16ef 100644 --- a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp +++ b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp @@ -23,6 +23,8 @@ #include #include +#include + namespace DB { namespace ErrorCodes @@ -37,8 +39,8 @@ inline void serializeToPacket(mpp::MPPDataPacket & packet, const tipb::SelectRes throw Exception(fmt::format("Fail to serialize response, response size: {}", response.ByteSizeLong())); } -template -StreamingDAGResponseWriter::StreamingDAGResponseWriter( +template +StreamingDAGResponseWriter::StreamingDAGResponseWriter( StreamWriterPtr writer_, std::vector partition_col_ids_, TiDB::TiDBCollators collators_, @@ -46,7 +48,9 @@ StreamingDAGResponseWriter::StreamingDAGResponseWriter( Int64 records_per_chunk_, Int64 batch_send_min_limit_, bool should_send_exec_summary_at_last_, - DAGContext & dag_context_) + DAGContext & dag_context_, + uint64_t fine_grained_shuffle_stream_count_, + UInt64 fine_grained_shuffle_batch_size_) : DAGResponseWriter(records_per_chunk_, dag_context_) , batch_send_min_limit(batch_send_min_limit_) , should_send_exec_summary_at_last(should_send_exec_summary_at_last_) @@ -54,6 +58,8 @@ StreamingDAGResponseWriter::StreamingDAGResponseWriter( , writer(writer_) , partition_col_ids(std::move(partition_col_ids_)) , collators(std::move(collators_)) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) + , fine_grained_shuffle_batch_size(fine_grained_shuffle_batch_size_) { rows_in_blocks = 0; partition_num = writer_->getPartitionNum(); @@ -71,17 +77,37 @@ StreamingDAGResponseWriter::StreamingDAGResponseWriter( } } -template -void StreamingDAGResponseWriter::finishWrite() +template +void StreamingDAGResponseWriter::finishWrite() { if (should_send_exec_summary_at_last) - batchWrite(); + { + if constexpr (enable_fine_grained_shuffle) + { + assert(exchange_type == tipb::ExchangeType::Hash); + batchWriteFineGrainedShuffle(); + } + else + { + batchWrite(); + } + } else - batchWrite(); + { + if constexpr (enable_fine_grained_shuffle) + { + assert(exchange_type == tipb::ExchangeType::Hash); + batchWriteFineGrainedShuffle(); + } + else + { + batchWrite(); + } + } } -template -void StreamingDAGResponseWriter::write(const Block & block) +template +void StreamingDAGResponseWriter::write(const Block & block) { if (block.columns() != dag_context.result_field_types.size()) throw TiFlashException("Output column size mismatch with field type size", Errors::Coprocessor::Internal); @@ -91,15 +117,23 @@ void StreamingDAGResponseWriter::write(const Block & block) { blocks.push_back(block); } - if (static_cast(rows_in_blocks) > (dag_context.encode_type == tipb::EncodeType::TypeCHBlock ? 
batch_send_min_limit : records_per_chunk - 1)) + + if constexpr (enable_fine_grained_shuffle) { - batchWrite(); + assert(exchange_type == tipb::ExchangeType::Hash); + if (static_cast(rows_in_blocks) >= fine_grained_shuffle_batch_size) + batchWriteFineGrainedShuffle(); + } + else + { + if (static_cast(rows_in_blocks) > (dag_context.encode_type == tipb::EncodeType::TypeCHBlock ? batch_send_min_limit : records_per_chunk - 1)) + batchWrite(); } } -template +template template -void StreamingDAGResponseWriter::encodeThenWriteBlocks( +void StreamingDAGResponseWriter::encodeThenWriteBlocks( const std::vector & input_blocks, tipb::SelectResponse & response) const { @@ -191,133 +225,238 @@ void StreamingDAGResponseWriter::encodeThenWriteBlocks( } } -/// hash exchanging data among only TiFlash nodes. -template + +template template -void StreamingDAGResponseWriter::partitionAndEncodeThenWriteBlocks( - std::vector & input_blocks, - tipb::SelectResponse & response) const +void StreamingDAGResponseWriter::batchWrite() { - std::vector packet(partition_num); - - std::vector responses_row_count(partition_num); + tipb::SelectResponse response; + if constexpr (send_exec_summary_at_last) + addExecuteSummaries(response, !dag_context.isMPPTask() || dag_context.isRootMPPTask()); + if (exchange_type == tipb::ExchangeType::Hash) + { + partitionAndEncodeThenWriteBlocks(blocks, response); + } + else + { + encodeThenWriteBlocks(blocks, response); + } + blocks.clear(); + rows_in_blocks = 0; +} +template +template +void StreamingDAGResponseWriter::handleExecSummary( + const std::vector & input_blocks, + std::vector & packet, + tipb::SelectResponse & response) const +{ if constexpr (send_exec_summary_at_last) { /// Sending the response to only one node, default the first one. serializeToPacket(packet[0], response); - } - if (input_blocks.empty()) - { - if constexpr (send_exec_summary_at_last) + // No need to send data when blocks are not empty, + // because exec_summary will be sent together with blocks. 
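Back in write() above: with fine grained shuffle enabled, the flush condition is simply the buffered row count against fine_grained_shuffle_batch_size, regardless of encode type. A worked example of that rule (numbers illustrative):

```cpp
#include <cstdint>
#include <cstdio>

// With fine grained shuffle, the writer flushes once the buffered row count
// reaches fine_grained_shuffle_batch_size, independent of the encode type.
int main()
{
    const int64_t batch_size = 4096; // fine_grained_shuffle_batch_size
    int64_t rows_in_blocks = 0;
    for (int block = 1; block <= 8; ++block)
    {
        rows_in_blocks += 1024; // each incoming block carries 1024 rows
        if (rows_in_blocks >= batch_size)
        {
            printf("flush after block %d (%lld rows)\n", block, (long long)rows_in_blocks);
            rows_in_blocks = 0; // batchWriteFineGrainedShuffle() resets the buffer
        }
    }
}
```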
+ if (input_blocks.empty()) { for (auto part_id = 0; part_id < partition_num; ++part_id) { writer->write(packet[part_id], part_id); } } - return; } +} - // partition tuples in blocks - // 1) compute partition id - // 2) partition each row - // 3) encode each chunk and send it - std::vector partition_key_containers(collators.size()); - for (auto & block : input_blocks) +template +template +void StreamingDAGResponseWriter::writePackets(const std::vector & responses_row_count, + std::vector & packets) const +{ + for (size_t part_id = 0; part_id < packets.size(); ++part_id) { - std::vector dest_blocks(partition_num); - std::vector dest_tbl_cols(partition_num); - - for (size_t i = 0; i < block.columns(); ++i) + if constexpr (send_exec_summary_at_last) { - if (ColumnPtr converted = block.getByPosition(i).column->convertToFullColumnIfConst()) - { - block.getByPosition(i).column = converted; - } + writer->write(packets[part_id], part_id); } - - for (auto i = 0; i < partition_num; ++i) + else { - dest_tbl_cols[i] = block.cloneEmptyColumns(); - dest_blocks[i] = block.cloneEmpty(); + if (responses_row_count[part_id] > 0) + writer->write(packets[part_id], part_id); } + } +} - size_t rows = block.rows(); - WeakHash32 hash(rows); - - // get hash values by all partition key columns - for (size_t i = 0; i < partition_col_ids.size(); i++) +inline void initInputBlocks(std::vector & input_blocks) +{ + for (auto & input_block : input_blocks) + { + for (size_t i = 0; i < input_block.columns(); ++i) { - block.getByPosition(partition_col_ids[i]).column->updateWeakHash32(hash, collators[i], partition_key_containers[i]); + if (ColumnPtr converted = input_block.getByPosition(i).column->convertToFullColumnIfConst()) + input_block.getByPosition(i).column = converted; } - const auto & hash_data = hash.getData(); + } +} - // partition each row - IColumn::Selector selector(rows); - for (size_t row = 0; row < rows; ++row) - { - /// Row from interval [(2^32 / partition_num) * i, (2^32 / partition_num) * (i + 1)) goes to bucket with number i. - selector[row] = hash_data[row]; /// [0, 2^32) - selector[row] *= partition_num; /// [0, partition_num * 2^32), selector stores 64 bit values. 
- selector[row] >>= 32u; /// [0, partition_num) - } +inline void initDestColumns(const Block & input_block, std::vector & dest_tbl_cols) +{ + for (auto & cols : dest_tbl_cols) + { + cols = input_block.cloneEmptyColumns(); + } +} - for (size_t col_id = 0; col_id < block.columns(); ++col_id) - { - // Scatter columns to different partitions - auto scattered_columns = block.getByPosition(col_id).column->scatter(partition_num, selector); - for (size_t part_id = 0; part_id < partition_num; ++part_id) - { - dest_tbl_cols[part_id][col_id] = std::move(scattered_columns[part_id]); - } - } - // serialize each partitioned block and write it to its destination - for (auto part_id = 0; part_id < partition_num; ++part_id) - { - dest_blocks[part_id].setColumns(std::move(dest_tbl_cols[part_id])); - responses_row_count[part_id] += dest_blocks[part_id].rows(); - chunk_codec_stream->encode(dest_blocks[part_id], 0, dest_blocks[part_id].rows()); - packet[part_id].add_chunks(chunk_codec_stream->getString()); - chunk_codec_stream->clear(); - } +void computeHash(const Block & input_block, + uint32_t bucket_num, + const TiDB::TiDBCollators & collators, + std::vector & partition_key_containers, + const std::vector & partition_col_ids, + std::vector> & result_columns) +{ + size_t rows = input_block.rows(); + WeakHash32 hash(rows); + + // get hash values by all partition key columns + for (size_t i = 0; i < partition_col_ids.size(); ++i) + { + input_block.getByPosition(partition_col_ids[i]).column->updateWeakHash32(hash, collators[i], partition_key_containers[i]); } - for (auto part_id = 0; part_id < partition_num; ++part_id) + const auto & hash_data = hash.getData(); + + // partition each row + IColumn::Selector selector(rows); + for (size_t row = 0; row < rows; ++row) { - if constexpr (send_exec_summary_at_last) + /// Row from interval [(2^32 / bucket_num) * i, (2^32 / bucket_num) * (i + 1)) goes to bucket with number i. + selector[row] = hash_data[row]; /// [0, 2^32) + selector[row] *= bucket_num; /// [0, bucket_num * 2^32), selector stores 64 bit values. + selector[row] >>= 32u; /// [0, bucket_num) + } + + for (size_t col_id = 0; col_id < input_block.columns(); ++col_id) + { + // Scatter columns to different partitions + std::vector part_columns = input_block.getByPosition(col_id).column->scatter(bucket_num, selector); + assert(part_columns.size() == bucket_num); + for (size_t bucket_idx = 0; bucket_idx < bucket_num; ++bucket_idx) { - writer->write(packet[part_id], part_id); + result_columns[bucket_idx][col_id] = std::move(part_columns[bucket_idx]); } - else + } +} + +/// Hash exchanging data among only TiFlash nodes. Only be called when enable_fine_grained_shuffle is false. 
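A worked example of the selector arithmetic in computeHash() above; the multiply-then-shift maps a 32-bit hash uniformly onto [0, bucket_num) without a modulo:

```cpp
#include <cstdint>
#include <cstdio>

// A 32-bit hash h lands in bucket (h * bucket_num) >> 32, i.e. hashes from
// [(2^32 / bucket_num) * i, (2^32 / bucket_num) * (i + 1)) map to bucket i.
int main()
{
    const uint64_t bucket_num = 32; // e.g. partition_num(4) * stream_count(8)
    const uint32_t h = 0x80000000; // a hash exactly halfway through [0, 2^32)
    uint64_t selector = static_cast<uint64_t>(h) * bucket_num; // [0, bucket_num * 2^32)
    selector >>= 32u;                                          // [0, bucket_num)
    printf("bucket = %llu\n", (unsigned long long)selector);   // prints 16
}
```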
+template +template +void StreamingDAGResponseWriter::partitionAndEncodeThenWriteBlocks( + std::vector & input_blocks, + tipb::SelectResponse & response) const +{ + static_assert(!enable_fine_grained_shuffle); + std::vector packet(partition_num); + std::vector responses_row_count(partition_num); + handleExecSummary(input_blocks, packet, response); + if (input_blocks.empty()) + return; + + initInputBlocks(input_blocks); + Block dest_block = input_blocks[0].cloneEmpty(); + std::vector partition_key_containers(collators.size()); + for (const auto & block : input_blocks) + { + std::vector dest_tbl_cols(partition_num); + initDestColumns(block, dest_tbl_cols); + + computeHash(block, partition_num, collators, partition_key_containers, partition_col_ids, dest_tbl_cols); + + for (size_t part_id = 0; part_id < partition_num; ++part_id) { - if (responses_row_count[part_id] > 0) - writer->write(packet[part_id], part_id); + dest_block.setColumns(std::move(dest_tbl_cols[part_id])); + responses_row_count[part_id] += dest_block.rows(); + chunk_codec_stream->encode(dest_block, 0, dest_block.rows()); + packet[part_id].add_chunks(chunk_codec_stream->getString()); + chunk_codec_stream->clear(); } } + + writePackets(responses_row_count, packet); } -template +/// Hash exchanging data among only TiFlash nodes. Only be called when enable_fine_grained_shuffle is true. +template template -void StreamingDAGResponseWriter::batchWrite() +void StreamingDAGResponseWriter::batchWriteFineGrainedShuffle() { + static_assert(enable_fine_grained_shuffle); + assert(exchange_type == tipb::ExchangeType::Hash); + assert(fine_grained_shuffle_stream_count <= 1024); + tipb::SelectResponse response; if constexpr (send_exec_summary_at_last) addExecuteSummaries(response, !dag_context.isMPPTask() || dag_context.isRootMPPTask()); - if (exchange_type == tipb::ExchangeType::Hash) - { - partitionAndEncodeThenWriteBlocks(blocks, response); - } - else + + std::vector packet(partition_num); + std::vector responses_row_count(partition_num, 0); + + // fine_grained_shuffle_stream_count is in [0, 1024], and partition_num is uint16_t, so will not overflow. + uint32_t bucket_num = partition_num * fine_grained_shuffle_stream_count; + handleExecSummary(blocks, packet, response); + if (!blocks.empty()) { - encodeThenWriteBlocks(blocks, response); + std::vector final_dest_tbl_columns(bucket_num); + initInputBlocks(blocks); + initDestColumns(blocks[0], final_dest_tbl_columns); + + // Hash partition input_blocks into bucket_num. + for (const auto & block : blocks) + { + std::vector partition_key_containers(collators.size()); + std::vector dest_tbl_columns(bucket_num); + initDestColumns(block, dest_tbl_columns); + computeHash(block, bucket_num, collators, partition_key_containers, partition_col_ids, dest_tbl_columns); + for (size_t bucket_idx = 0; bucket_idx < bucket_num; ++bucket_idx) + { + for (size_t col_id = 0; col_id < block.columns(); ++col_id) + { + const MutableColumnPtr & src_col = dest_tbl_columns[bucket_idx][col_id]; + final_dest_tbl_columns[bucket_idx][col_id]->insertRangeFrom(*src_col, 0, src_col->size()); + } + } + } + + // For i-th stream_count buckets, send to i-th tiflash node. 
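The function that follows distributes bucket_num = partition_num * fine_grained_shuffle_stream_count buckets so that each run of stream_count consecutive buckets belongs to one downstream partition. A small sketch of that layout (illustrative counts):

```cpp
#include <cstddef>
#include <cstdio>

// Buckets are laid out partition-major: consecutive runs of
// fine_grained_shuffle_stream_count buckets all belong to one partition.
int main()
{
    const size_t stream_count = 3; // fine_grained_shuffle_stream_count
    const size_t partition_num = 2;
    const size_t bucket_num = partition_num * stream_count;
    for (size_t bucket_idx = 0; bucket_idx < bucket_num; ++bucket_idx)
        printf("bucket %zu -> partition %zu, stream_id %zu\n",
               bucket_idx,
               bucket_idx / stream_count,  // part_id, as in the loop below
               bucket_idx % stream_count); // stream id recorded via add_stream_ids
}
```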
+        for (size_t bucket_idx = 0; bucket_idx < bucket_num; bucket_idx += fine_grained_shuffle_stream_count)
+        {
+            size_t part_id = bucket_idx / fine_grained_shuffle_stream_count; // NOLINT(clang-analyzer-core.DivideZero)
+            size_t row_count_per_part = 0;
+            for (uint64_t stream_idx = 0; stream_idx < fine_grained_shuffle_stream_count; ++stream_idx)
+            {
+                Block dest_block = blocks[0].cloneEmpty();
+                // For now we put all rows into one Block, which may cause this Block to be too large.
+                dest_block.setColumns(std::move(final_dest_tbl_columns[bucket_idx + stream_idx]));
+                row_count_per_part += dest_block.rows();
+
+                chunk_codec_stream->encode(dest_block, 0, dest_block.rows());
+                packet[part_id].add_chunks(chunk_codec_stream->getString());
+                packet[part_id].add_stream_ids(stream_idx);
+                chunk_codec_stream->clear();
+            }
+            responses_row_count[part_id] = row_count_per_part;
+        }
     }
+
+    writePackets(responses_row_count, packet);
+
     blocks.clear();
     rows_in_blocks = 0;
 }

-template class StreamingDAGResponseWriter<StreamWriterPtr>;
-template class StreamingDAGResponseWriter<MPPTunnelSetPtr>;
+template class StreamingDAGResponseWriter<StreamWriterPtr, true>;
+template class StreamingDAGResponseWriter<StreamWriterPtr, false>;
+template class StreamingDAGResponseWriter<MPPTunnelSetPtr, true>;
+template class StreamingDAGResponseWriter<MPPTunnelSetPtr, false>;
 } // namespace DB
diff --git a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h
index 9b5e3864c64..cd7559d1e79 100644
--- a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h
+++ b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h
@@ -33,7 +33,7 @@ namespace DB
 /// Serializes the stream of blocks and sends them to TiDB or TiFlash with different serialization paths.
 /// When sending data to TiDB, blocks with extra info are written into tipb::SelectResponse, then the whole tipb::SelectResponse is further serialized into mpp::MPPDataPacket.data.
 /// Differently when sending data to TiFlash, blocks with only tuples are directly serialized into mpp::MPPDataPacket.chunks, but for the last block, its extra info (like execution summaries) is written into tipb::SelectResponse, then further serialized into mpp::MPPDataPacket.data.
-template <class StreamWriterPtr>
+template <class StreamWriterPtr, bool enable_fine_grained_shuffle>
 class StreamingDAGResponseWriter : public DAGResponseWriter
 {
 public:
@@ -45,18 +45,30 @@ class StreamingDAGResponseWriter : public DAGResponseWriter
         Int64 records_per_chunk_,
         Int64 batch_send_min_limit_,
         bool should_send_exec_summary_at_last,
-        DAGContext & dag_context_);
+        DAGContext & dag_context_,
+        UInt64 fine_grained_shuffle_stream_count_,
+        UInt64 fine_grained_shuffle_batch_size);
     void write(const Block & block) override;
     void finishWrite() override;

 private:
     template <bool send_exec_summary_at_last>
     void batchWrite();
+    template <bool send_exec_summary_at_last>
+    void batchWriteFineGrainedShuffle();
+
     template <bool send_exec_summary_at_last>
     void encodeThenWriteBlocks(const std::vector<Block> & input_blocks, tipb::SelectResponse & response) const;
     template <bool send_exec_summary_at_last>
     void partitionAndEncodeThenWriteBlocks(std::vector<Block> & input_blocks, tipb::SelectResponse & response) const;
+    template <bool send_exec_summary_at_last>
+    void handleExecSummary(const std::vector<Block> & input_blocks,
+                           std::vector<mpp::MPPDataPacket> & packet,
+                           tipb::SelectResponse & response) const;
+    template <bool send_exec_summary_at_last>
+    void writePackets(const std::vector<size_t> & responses_row_count, std::vector<mpp::MPPDataPacket> & packets) const;
+
     Int64 batch_send_min_limit;
     bool should_send_exec_summary_at_last; /// only one stream needs to send execution summaries at last.
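Since enable_fine_grained_shuffle is a template parameter, call sites must turn the runtime plan flag into a compile-time argument, which is why the instantiation list above doubled. A generic sketch of that dispatch pattern (illustrative names, not from this diff):

```cpp
#include <cstdio>

// The runtime flag from the plan picks one of two instantiations, so the hot
// loop runs with `if constexpr` fully resolved at compile time.
template <bool enable_fine_grained_shuffle>
void writeLoop()
{
    if constexpr (enable_fine_grained_shuffle)
        printf("fine grained path\n");
    else
        printf("classic path\n");
}

void dispatch(bool enable_fine_grained_shuffle)
{
    if (enable_fine_grained_shuffle)
        writeLoop<true>();
    else
        writeLoop<false>();
}

int main() { dispatch(true); dispatch(false); }
```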
tipb::ExchangeType exchange_type; @@ -67,6 +79,8 @@ class StreamingDAGResponseWriter : public DAGResponseWriter size_t rows_in_blocks; uint16_t partition_num; std::unique_ptr chunk_codec_stream; + UInt64 fine_grained_shuffle_stream_count; + UInt64 fine_grained_shuffle_batch_size; }; } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/TiDBColumn.cpp b/dbms/src/Flash/Coprocessor/TiDBColumn.cpp index 7183374a5c1..eef89696d3a 100644 --- a/dbms/src/Flash/Coprocessor/TiDBColumn.cpp +++ b/dbms/src/Flash/Coprocessor/TiDBColumn.cpp @@ -28,7 +28,7 @@ template void encodeLittleEndian(const T & value, WriteBuffer & ss) { auto v = toLittleEndian(value); - ss.write(reinterpret_cast(&v), sizeof(v)); + ss.template writeFixed(&v); } TiDBColumn::TiDBColumn(Int8 element_len_) @@ -141,10 +141,10 @@ void TiDBColumn::append(const TiDBDecimal & decimal) encodeLittleEndian(decimal.digits_int, *data); encodeLittleEndian(decimal.digits_frac, *data); encodeLittleEndian(decimal.result_frac, *data); - encodeLittleEndian((UInt8)decimal.negative, *data); - for (int i = 0; i < MAX_WORD_BUF_LEN; i++) + encodeLittleEndian(static_cast(decimal.negative), *data); + for (int i : decimal.word_buf) { - encodeLittleEndian(decimal.word_buf[i], *data); + encodeLittleEndian(i, *data); } finishAppendFixed(); } diff --git a/dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp b/dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp index b68279faa13..87744c553e0 100644 --- a/dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp +++ b/dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp @@ -45,19 +45,36 @@ bool collectForAgg(std::vector & output_field_types, const tipb { for (const auto & expr : agg.agg_func()) { - if (!exprHasValidFieldType(expr)) + if (unlikely(!exprHasValidFieldType(expr))) throw TiFlashException("Agg expression without valid field type", Errors::Coprocessor::BadRequest); output_field_types.push_back(expr.field_type()); } for (const auto & expr : agg.group_by()) { - if (!exprHasValidFieldType(expr)) + if (unlikely(!exprHasValidFieldType(expr))) throw TiFlashException("Group by expression without valid field type", Errors::Coprocessor::BadRequest); output_field_types.push_back(expr.field_type()); } return false; } +bool collectForExecutor(std::vector & output_field_types, const tipb::Executor & executor); +bool collectForWindow(std::vector & output_field_types, const tipb::Executor & executor) +{ + // collect output_field_types of child + getChildren(executor).forEach([&output_field_types](const tipb::Executor & child) { + traverseExecutorTree(child, [&output_field_types](const tipb::Executor & e) { return collectForExecutor(output_field_types, e); }); + }); + + for (const auto & expr : executor.window().func_desc()) + { + if (unlikely(!exprHasValidFieldType(expr))) + throw TiFlashException("Window expression without valid field type", Errors::Coprocessor::BadRequest); + output_field_types.push_back(expr.field_type()); + } + return false; +} + bool collectForReceiver(std::vector & output_field_types, const tipb::ExchangeReceiver & receiver) { for (const auto & field_type : receiver.field_types()) @@ -82,7 +99,6 @@ bool collectForTableScan(std::vector & output_field_types, cons return false; } -bool collectForExecutor(std::vector & output_field_types, const tipb::Executor & executor); bool collectForJoin(std::vector & output_field_types, const tipb::Executor & executor) { // collect output_field_types of children @@ -147,8 +163,8 @@ bool collectForExecutor(std::vector & output_field_types, const case 
tipb::ExecType::TypeWindow: // Window will only be pushed down in mpp mode. // In mpp mode, ExchangeSender or Sender will return output_field_types directly. - // If not in mpp mode, window executor type is invalid. - throw TiFlashException("Window executor type is invalid in non-mpp mode, should not reach here.", Errors::Coprocessor::Internal); + // If not in mpp mode or debug mode, window executor type is invalid. + return collectForWindow(output_field_types, executor); case tipb::ExecType::TypeExchangeReceiver: return collectForReceiver(output_field_types, executor.exchange_receiver()); case tipb::ExecType::TypeTableScan: diff --git a/dbms/src/Flash/Coprocessor/tests/gtest_streaming_dag_writer.cpp b/dbms/src/Flash/Coprocessor/tests/gtest_streaming_dag_writer.cpp new file mode 100644 index 00000000000..5d4186123b7 --- /dev/null +++ b/dbms/src/Flash/Coprocessor/tests/gtest_streaming_dag_writer.cpp @@ -0,0 +1,184 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ +namespace tests +{ + +using BlockPtr = std::shared_ptr; +class TestStreamingDAGResponseWriter : public testing::Test +{ +protected: + void SetUp() override + { + dag_context_ptr = std::make_unique(1024); + dag_context_ptr->encode_type = tipb::EncodeType::TypeCHBlock; + dag_context_ptr->is_mpp_task = true; + dag_context_ptr->is_root_mpp_task = false; + dag_context_ptr->result_field_types = makeFields(); + context.setDAGContext(dag_context_ptr.get()); + } + +public: + TestStreamingDAGResponseWriter() + : context(TiFlashTestEnv::getContext()) + , part_col_ids{0} + , part_col_collators{ + TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY)} + {} + + // Return 10 Int64 column. + static std::vector makeFields() + { + std::vector fields(10); + for (int i = 0; i < 10; ++i) + { + fields[i].set_tp(TiDB::TypeLongLong); + } + return fields; + } + + // Return a block with **rows** and 10 Int64 column. 
+ static BlockPtr prepareBlock(const std::vector & rows) + { + BlockPtr block = std::make_shared(); + for (int i = 0; i < 10; ++i) + { + DataTypePtr int64_data_type = std::make_shared(); + DataTypePtr nullable_int64_data_type = std::make_shared(int64_data_type); + MutableColumnPtr int64_col = nullable_int64_data_type->createColumn(); + for (Int64 r : rows) + { + int64_col->insert(Field(r)); + } + block->insert(ColumnWithTypeAndName{std::move(int64_col), + nullable_int64_data_type, + String("col") + std::to_string(i)}); + } + return block; + } + + Context context; + std::vector part_col_ids; + TiDB::TiDBCollators part_col_collators; + + std::unique_ptr dag_context_ptr; +}; + +using MockStreamWriterChecker = std::function; + +struct MockStreamWriter +{ + MockStreamWriter(MockStreamWriterChecker checker_, + uint16_t part_num_) + : checker(checker_) + , part_num(part_num_) + {} + + void write(mpp::MPPDataPacket &) { FAIL() << "cannot reach here, because we only expect hash partition"; } + void write(mpp::MPPDataPacket & packet, uint16_t part_id) { checker(packet, part_id); } + void write(tipb::SelectResponse &, uint16_t) { FAIL() << "cannot reach here, only consider CH Block format"; } + void write(tipb::SelectResponse &) { FAIL() << "cannot reach here, only consider CH Block format"; } + uint16_t getPartitionNum() const { return part_num; } + +private: + MockStreamWriterChecker checker; + uint16_t part_num; +}; + +// Input block data is distributed uniform. +// partition_num: 4 +// fine_grained_shuffle_stream_count: 8 +TEST_F(TestStreamingDAGResponseWriter, testBatchWriteFineGrainedShuffle) +try +{ + const size_t block_rows = 1024; + const uint16_t part_num = 4; + const uint32_t fine_grained_shuffle_stream_count = 8; + const Int64 fine_grained_shuffle_batch_size = 4096; + + // Set these to 1, because when fine grained shuffle is enabled, + // batchWriteFineGrainedShuffle() only check fine_grained_shuffle_batch_size. + // records_per_chunk and batch_send_min_limit are useless. + const Int64 records_per_chunk = 1; + const Int64 batch_send_min_limit = 1; + const bool should_send_exec_summary_at_last = true; + + // 1. Build Block. + std::vector uniform_data_set; + for (size_t i = 0; i < block_rows; ++i) + { + uniform_data_set.push_back(i); + } + BlockPtr block = prepareBlock(uniform_data_set); + + // 2. Build MockStreamWriter. + std::unordered_map write_report; + auto checker = [&write_report](mpp::MPPDataPacket & packet, uint16_t part_id) { + auto res = write_report.insert({part_id, packet}); + // Should always insert succeed. + // Because block.rows(1024) < fine_grained_shuffle_batch_size(4096), + // batchWriteFineGrainedShuffle() only called once, so will only be one packet for each partition. + ASSERT_TRUE(res.second); + }; + auto mock_writer = std::make_shared(checker, part_num); + + // 3. Start to write. + auto dag_writer = std::make_shared, /*enable_fine_grained_shuffle=*/true>>( + mock_writer, + part_col_ids, + part_col_collators, + tipb::ExchangeType::Hash, + records_per_chunk, + batch_send_min_limit, + should_send_exec_summary_at_last, + *dag_context_ptr, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size); + dag_writer->write(*block); + dag_writer->finishWrite(); + + // 4. Start to check write_report. 
+ std::vector decoded_blocks; + ASSERT_EQ(write_report.size(), part_num); + for (const auto & ele : write_report) + { + const mpp::MPPDataPacket & packet = ele.second; + ASSERT_EQ(packet.chunks_size(), packet.stream_ids_size()); + for (int i = 0; i < packet.chunks_size(); ++i) + { + decoded_blocks.push_back(CHBlockChunkCodec::decode(packet.chunks(i), *block)); + } + } + ASSERT_EQ(decoded_blocks.size(), fine_grained_shuffle_stream_count * part_num); + for (const auto & block : decoded_blocks) + { + ASSERT_EQ(block.rows(), block_rows / (fine_grained_shuffle_stream_count * part_num)); + } +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/EstablishCall.cpp b/dbms/src/Flash/EstablishCall.cpp index 8af81e30962..2f8c7c15f56 100644 --- a/dbms/src/Flash/EstablishCall.cpp +++ b/dbms/src/Flash/EstablishCall.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -19,6 +20,11 @@ namespace DB { +namespace FailPoints +{ +extern const char random_tunnel_init_rpc_failure_failpoint[]; +} // namespace FailPoints + EstablishCallData::EstablishCallData(AsyncFlashService * service, grpc::ServerCompletionQueue * cq, grpc::ServerCompletionQueue * notify_cq, const std::shared_ptr> & is_shutdown) : service(service) , cq(cq) @@ -71,6 +77,7 @@ void EstablishCallData::initRpc() std::exception_ptr eptr = nullptr; try { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_tunnel_init_rpc_failure_failpoint); service->establishMPPConnectionSyncOrAsync(&ctx, &request, nullptr, this); } catch (...) @@ -136,7 +143,7 @@ void EstablishCallData::finishTunnelAndResponder() state = FINISH; if (mpp_tunnel) { - mpp_tunnel->consumerFinish("grpc writes failed.", true); //trigger mpp tunnel finish work + mpp_tunnel->consumerFinish(fmt::format("{}: finishTunnelAndResponder called.", mpp_tunnel->id()), true); //trigger mpp tunnel finish work } grpc::Status status(static_cast(GRPC_STATUS_UNKNOWN), "Consumer exits unexpected, grpc writes failed."); responder.Finish(status, this); diff --git a/dbms/src/Flash/Management/tests/gtest_manual_compact.cpp b/dbms/src/Flash/Management/tests/gtest_manual_compact.cpp index df6c881c306..1e9da93ffe3 100644 --- a/dbms/src/Flash/Management/tests/gtest_manual_compact.cpp +++ b/dbms/src/Flash/Management/tests/gtest_manual_compact.cpp @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -48,7 +47,6 @@ class BasicManualCompactTest BasicManualCompactTest() { - log = &Poco::Logger::get(DB::base::TiFlashStorageTestBasic::getCurrentFullTestName()); pk_type = GetParam(); } @@ -63,7 +61,7 @@ class BasicManualCompactTest setupStorage(); // In tests let's only compact one segment. - db_context->setSetting("manual_compact_more_until_ms", UInt64(0)); + db_context->setSetting("manual_compact_more_until_ms", Field(UInt64(0))); // Split into 4 segments, and prepare some delta data for first 3 segments. helper = std::make_unique(*db_context); @@ -116,8 +114,6 @@ class BasicManualCompactTest std::unique_ptr manager; DM::tests::DMTestEnv::PkType pk_type; - - [[maybe_unused]] Poco::Logger * log; }; @@ -315,7 +311,7 @@ CATCH TEST_P(BasicManualCompactTest, CompactMultiple) try { - db_context->setSetting("manual_compact_more_until_ms", UInt64(60 * 1000)); // Hope it's long enough! + db_context->setSetting("manual_compact_more_until_ms", Field(UInt64(60 * 1000))); // Hope it's long enough! 
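Stepping back to the expectations encoded in gtest_streaming_dag_writer.cpp above: with uniformly distributed input the row counts are fully determined by the partition and stream counts. Restating the arithmetic behind the assertions:

```cpp
// bucket_num = part_num * fine_grained_shuffle_stream_count = 4 * 8 = 32,
// so 1024 uniformly hashed rows give 1024 / 32 = 32 rows per decoded block,
// and each of the 4 packets carries 8 chunks with matching stream_ids.
static_assert(4 * 8 == 32, "bucket_num = part_num * stream_count");
static_assert(1024 / (4 * 8) == 32, "rows per decoded block");
int main() {}
```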
     auto request = ::kvrpcpb::CompactRequest();
     request.set_physical_table_id(TABLE_ID);
diff --git a/dbms/src/Flash/Mpp/ExchangeReceiver.cpp b/dbms/src/Flash/Mpp/ExchangeReceiver.cpp
index f194afee31f..ab8d83a1481 100644
--- a/dbms/src/Flash/Mpp/ExchangeReceiver.cpp
+++ b/dbms/src/Flash/Mpp/ExchangeReceiver.cpp
@@ -13,6 +13,8 @@
 // limitations under the License.

 #include
+#include
+#include
 #include
 #include
 #include
@@ -22,6 +24,12 @@
 namespace DB
 {
+namespace FailPoints
+{
+extern const char random_receiver_sync_msg_push_failure_failpoint[];
+extern const char random_receiver_async_msg_push_failure_failpoint[];
+} // namespace FailPoints
+
 namespace
 {
 String getReceiverStateStr(const ExchangeReceiverState & s)
@@ -41,6 +49,106 @@ String getReceiverStateStr(const ExchangeReceiverState & s)
     }
 }

+// If enable_fine_grained_shuffle:
+//      Separate chunks according to packet.stream_ids[i], then push to msg_channels[stream_id].
+// If fine grained shuffle is disabled:
+//      Push all chunks to msg_channels[0].
+// Return true if all pushes succeed, otherwise return false.
+// NOTE: shared_ptr<MPPDataPacket> will be held by all ExchangeReceiverBlockInputStreams to keep the chunk pointers valid.
+template <bool enable_fine_grained_shuffle, bool is_sync>
+bool pushPacket(size_t source_index,
+                const String & req_info,
+                MPPDataPacketPtr & packet,
+                const std::vector<MsgChannelPtr> & msg_channels,
+                LoggerPtr & log)
+{
+    bool push_succeed = true;
+
+    const mpp::Error * error_ptr = nullptr;
+    if (packet->has_error())
+        error_ptr = &packet->error();
+    const String * resp_ptr = nullptr;
+    if (!packet->data().empty())
+        resp_ptr = &packet->data();
+
+    if constexpr (enable_fine_grained_shuffle)
+    {
+        std::vector<std::vector<const String *>> chunks(msg_channels.size());
+        if (!packet->chunks().empty())
+        {
+            // Packet is not empty.
+            if (unlikely(packet->stream_ids().empty()))
+            {
+                // Fine grained shuffle is enabled in the receiver, but not in the sender. We cannot handle this, so return an error.
+                // This can happen when there are old-version nodes during a rolling upgrade.
+                LOG_FMT_ERROR(log, "MPPDataPacket.stream_ids empty, it means the ExchangeSender is an old-version binary "
+                                   "(source_index: {}) while fine grained shuffle of the ExchangeReceiver is enabled. "
+                                   "Cannot handle this.",
+                              source_index);
+                return false;
+            }
+            // packet.stream_ids[i] corresponds to packet.chunks[i],
+            // indicating which stream_id this chunk belongs to.
+            assert(packet->chunks_size() == packet->stream_ids_size());
+
+            for (int i = 0; i < packet->stream_ids_size(); ++i)
+            {
+                UInt64 stream_id = packet->stream_ids(i) % msg_channels.size();
+                chunks[stream_id].push_back(&packet->chunks(i));
+            }
+        }
+        // Still need to send error_ptr or resp_ptr even if packet.chunks_size() is zero.
+        for (size_t i = 0; i < msg_channels.size() && push_succeed; ++i)
+        {
+            if (resp_ptr == nullptr && error_ptr == nullptr && chunks[i].empty())
+                continue;
+
+            std::shared_ptr<ReceivedMessage> recv_msg = std::make_shared<ReceivedMessage>(
+                source_index,
+                req_info,
+                packet,
+                error_ptr,
+                resp_ptr,
+                std::move(chunks[i]));
+            push_succeed = msg_channels[i]->push(std::move(recv_msg));
+            if constexpr (is_sync)
+                fiu_do_on(FailPoints::random_receiver_sync_msg_push_failure_failpoint, push_succeed = false;);
+            else
+                fiu_do_on(FailPoints::random_receiver_async_msg_push_failure_failpoint, push_succeed = false;);
+
+            // Only the first ExchangeReceiverInputStream needs to handle resp.
+ resp_ptr = nullptr; + } + } + else + { + std::vector chunks(packet->chunks_size()); + for (int i = 0; i < packet->chunks_size(); ++i) + { + chunks[i] = &packet->chunks(i); + } + + if (!(resp_ptr == nullptr && error_ptr == nullptr && chunks.empty())) + { + std::shared_ptr recv_msg = std::make_shared( + source_index, + req_info, + packet, + error_ptr, + resp_ptr, + std::move(chunks)); + + push_succeed = msg_channels[0]->push(std::move(recv_msg)); + if constexpr (is_sync) + fiu_do_on(FailPoints::random_receiver_sync_msg_push_failure_failpoint, push_succeed = false;); + else + fiu_do_on(FailPoints::random_receiver_async_msg_push_failure_failpoint, push_succeed = false;); + } + } + LOG_FMT_DEBUG(log, "push recv_msg to msg_channels(size: {}) succeed:{}, enable_fine_grained_shuffle: {}", msg_channels.size(), push_succeed, enable_fine_grained_shuffle); + return push_succeed; +} + enum class AsyncRequestStage { NEED_INIT, @@ -57,25 +165,25 @@ using TimePoint = Clock::time_point; constexpr Int32 max_retry_times = 10; constexpr Int32 batch_packet_count = 16; -template +template class AsyncRequestHandler : public UnaryCallback { public: using Status = typename RPCContext::Status; using Request = typename RPCContext::Request; using AsyncReader = typename RPCContext::AsyncReader; - using Self = AsyncRequestHandler; + using Self = AsyncRequestHandler; AsyncRequestHandler( MPMCQueue * queue, - MPMCQueue> * msg_channel_, + std::vector * msg_channels_, const std::shared_ptr & context, const Request & req, const String & req_id) : rpc_context(context) , request(&req) , notify_queue(queue) - , msg_channel(msg_channel_) + , msg_channels(msg_channels_) , req_info(fmt::format("tunnel{}+{}", req.send_task_id, req.recv_task_id)) , log(Logger::get("ExchangeReceiver", req_id, req_info)) { @@ -253,11 +361,7 @@ class AsyncRequestHandler : public UnaryCallback for (size_t i = 0; i < read_packet_index; ++i) { auto & packet = packets[i]; - auto recv_msg = std::make_shared(); - recv_msg->packet = std::move(packet); - recv_msg->source_index = request->source_index; - recv_msg->req_info = req_info; - if (!msg_channel->push(std::move(recv_msg))) + if (!pushPacket(request->source_index, req_info, packet, *msg_channels, log)) return false; // can't reuse packet since it is sent to readers. 
packet = std::make_shared(); @@ -274,7 +378,7 @@ class AsyncRequestHandler : public UnaryCallback std::shared_ptr rpc_context; const Request * request; // won't be null MPMCQueue * notify_queue; // won't be null - MPMCQueue> * msg_channel; // won't be null + std::vector * msg_channels; // won't be null String req_info; bool meet_error = false; @@ -299,20 +403,32 @@ ExchangeReceiverBase::ExchangeReceiverBase( size_t source_num_, size_t max_streams_, const String & req_id, - const String & executor_id) + const String & executor_id, + uint64_t fine_grained_shuffle_stream_count_) : rpc_context(std::move(rpc_context_)) , source_num(source_num_) , max_streams(max_streams_) , max_buffer_size(std::max(batch_packet_count, std::max(source_num, max_streams_) * 2)) , thread_manager(newThreadManager()) - , msg_channel(max_buffer_size) , live_connections(source_num) , state(ExchangeReceiverState::NORMAL) , exc_log(Logger::get("ExchangeReceiver", req_id, executor_id)) , collected(false) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) { try { + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count_)) + { + for (size_t i = 0; i < max_streams_; ++i) + { + msg_channels.push_back(std::make_unique>>(max_buffer_size)); + } + } + else + { + msg_channels.push_back(std::make_unique>>(max_buffer_size)); + } rpc_context->fillSchema(schema); setUpConnection(); } @@ -349,14 +465,14 @@ template void ExchangeReceiverBase::cancel() { setEndState(ExchangeReceiverState::CANCELED); - msg_channel.finish(); + cancelAllMsgChannels(); } template void ExchangeReceiverBase::close() { setEndState(ExchangeReceiverState::CLOSED); - msg_channel.finish(); + finishAllMsgChannels(); } template @@ -371,7 +487,12 @@ void ExchangeReceiverBase::setUpConnection() async_requests.push_back(std::move(req)); else { - thread_manager->schedule(true, "Receiver", [this, req = std::move(req)] { readLoop(req); }); + thread_manager->schedule(true, "Receiver", [this, req = std::move(req)] { + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count)) + readLoop(req); + else + readLoop(req); + }); ++thread_count; } } @@ -379,15 +500,21 @@ void ExchangeReceiverBase::setUpConnection() // TODO: reduce this thread in the future. 
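For reference, the routing rule implemented by pushPacket() earlier in this file, reduced to its arithmetic (illustrative values):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Chunk i of a packet goes to msg_channels[stream_ids[i] % channel_count];
// without fine grained shuffle there is a single channel and every chunk
// goes to channel 0.
int main()
{
    const size_t channel_count = 4; // one per ExchangeReceiverBlockInputStream
    const std::vector<uint64_t> stream_ids = {0, 1, 5, 7}; // from MPPDataPacket
    for (size_t i = 0; i < stream_ids.size(); ++i)
        printf("chunk %zu -> channel %llu\n",
               i,
               (unsigned long long)(stream_ids[i] % channel_count));
}
```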
if (!async_requests.empty()) { - thread_manager->schedule(true, "RecvReactor", [this, async_requests = std::move(async_requests)] { reactor(async_requests); }); + thread_manager->schedule(true, "RecvReactor", [this, async_requests = std::move(async_requests)] { + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count)) + reactor(async_requests); + else + reactor(async_requests); + }); ++thread_count; } } template +template void ExchangeReceiverBase::reactor(const std::vector & async_requests) { - using AsyncHandler = AsyncRequestHandler; + using AsyncHandler = AsyncRequestHandler; GET_METRIC(tiflash_thread_count, type_threads_of_receiver_reactor).Increment(); SCOPE_EXIT({ @@ -403,7 +530,7 @@ void ExchangeReceiverBase::reactor(const std::vector & asyn std::vector> handlers; handlers.reserve(alive_async_connections); for (const auto & req : async_requests) - handlers.emplace_back(std::make_unique(&ready_requests, &msg_channel, rpc_context, req, exc_log->identifier())); + handlers.emplace_back(std::make_unique(&ready_requests, &msg_channels, rpc_context, req, exc_log->identifier())); while (alive_async_connections > 0) { @@ -415,7 +542,7 @@ void ExchangeReceiverBase::reactor(const std::vector & asyn for (Int32 i = 0; i < check_waiting_requests_freq; ++i) { AsyncHandler * handler = nullptr; - if (unlikely(!ready_requests.tryPop(handler, timeout))) + if (unlikely(!ready_requests.popTimeout(handler, timeout))) break; handler->handle(); @@ -448,6 +575,7 @@ void ExchangeReceiverBase::reactor(const std::vector & asyn } template +template void ExchangeReceiverBase::readLoop(const Request & req) { GET_METRIC(tiflash_thread_count, type_threads_of_receiver_read_loop).Increment(); @@ -472,18 +600,15 @@ void ExchangeReceiverBase::readLoop(const Request & req) for (;;) { LOG_FMT_TRACE(log, "begin next "); - auto recv_msg = std::make_shared(); - recv_msg->packet = std::make_shared(); - recv_msg->req_info = req_info; - recv_msg->source_index = req.source_index; - bool success = reader->read(recv_msg->packet); + MPPDataPacketPtr packet = std::make_shared(); + bool success = reader->read(packet); if (!success) break; has_data = true; - if (recv_msg->packet->has_error()) - throw Exception("Exchange receiver meet error : " + recv_msg->packet->error().msg()); + if (packet->has_error()) + throw Exception("Exchange receiver meet error : " + packet->error().msg()); - if (!msg_channel.push(std::move(recv_msg))) + if (!pushPacket(req.source_index, req_info, packet, msg_channels, log)) { meet_error = true; auto local_state = getState(); @@ -553,15 +678,15 @@ DecodeDetail ExchangeReceiverBase::decodeChunks( assert(recv_msg != nullptr); DecodeDetail detail; - int chunk_size = recv_msg->packet->chunks_size(); - if (chunk_size == 0) + if (recv_msg->chunks.empty()) return detail; + // Record total packet size even if fine grained shuffle is enabled. 
detail.packet_bytes = recv_msg->packet->ByteSizeLong(); - /// ExchangeReceiverBase should receive chunks of TypeCHBlock - for (int i = 0; i < chunk_size; ++i) + + for (const String * chunk : recv_msg->chunks) { - Block block = CHBlockChunkCodec::decode(recv_msg->packet->chunks(i), header); + Block block = CHBlockChunkCodec::decode(*chunk, header); detail.rows += block.rows(); if (unlikely(block.rows() == 0)) continue; @@ -571,10 +696,15 @@ DecodeDetail ExchangeReceiverBase::decodeChunks( } template -ExchangeReceiverResult ExchangeReceiverBase::nextResult(std::queue & block_queue, const Block & header) +ExchangeReceiverResult ExchangeReceiverBase::nextResult(std::queue & block_queue, const Block & header, size_t stream_id) { + if (unlikely(stream_id >= msg_channels.size())) + { + LOG_FMT_ERROR(exc_log, "stream_id out of range, stream_id: {}, total_stream_count: {}", stream_id, msg_channels.size()); + return {nullptr, 0, "", true, "stream_id out of range", false}; + } std::shared_ptr recv_msg; - if (!msg_channel.pop(recv_msg)) + if (!msg_channels[stream_id]->pop(recv_msg)) { std::unique_lock lock(mu); @@ -596,29 +726,32 @@ ExchangeReceiverResult ExchangeReceiverBase::nextResult(std::queuepacket != nullptr); + assert(recv_msg != nullptr); ExchangeReceiverResult result; - if (recv_msg->packet->has_error()) + if (recv_msg->error_ptr != nullptr) { - result = {nullptr, recv_msg->source_index, recv_msg->req_info, true, recv_msg->packet->error().msg(), false}; + result = {nullptr, recv_msg->source_index, recv_msg->req_info, true, recv_msg->error_ptr->msg(), false}; } else { - if (!recv_msg->packet->data().empty()) /// the data of the last packet is serialized from tipb::SelectResponse including execution summaries. + if (recv_msg->resp_ptr != nullptr) /// the data of the last packet is serialized from tipb::SelectResponse including execution summaries. { - auto resp_ptr = std::make_shared(); - if (!resp_ptr->ParseFromString(recv_msg->packet->data())) + auto select_resp = std::make_shared(); + if (!select_resp->ParseFromString(*(recv_msg->resp_ptr))) { result = {nullptr, recv_msg->source_index, recv_msg->req_info, true, "decode error", false}; } else { - result = {resp_ptr, recv_msg->source_index, recv_msg->req_info, false, "", false}; - /// If mocking TiFlash as TiDB, here should decode chunks from resp_ptr. - if (!resp_ptr->chunks().empty()) + result = {select_resp, recv_msg->source_index, recv_msg->req_info, false, "", false}; + /// If mocking TiFlash as TiDB, here should decode chunks from select_resp. + if (!select_resp->chunks().empty()) { - assert(recv_msg->packet->chunks().empty()); - result.decode_detail = CoprocessorReader::decodeChunks(resp_ptr, block_queue, header, schema); + assert(recv_msg->chunks.empty()); + // Fine grained shuffle should only be enabled when sending data to TiFlash node. + // So all data should be encoded into MPPDataPacket.chunks. 
+ RUNTIME_CHECK(!enableFineGrainedShuffle(fine_grained_shuffle_stream_count), Exception, "Data should not be encoded into tipb::SelectResponse.chunks when fine grained shuffle is enabled"); + result.decode_detail = CoprocessorReader::decodeChunks(select_resp, block_queue, header, schema); } } } @@ -626,7 +759,7 @@ ExchangeReceiverResult ExchangeReceiverBase::nextResult(std::queuesource_index, recv_msg->req_info, false, "", false}; } - if (!result.meet_error && !recv_msg->packet->chunks().empty()) + if (!result.meet_error && !recv_msg->chunks.empty()) { assert(result.decode_detail.rows == 0); result.decode_detail = decodeChunks(recv_msg, block_queue, header); @@ -688,7 +821,21 @@ void ExchangeReceiverBase::connectionDone( throw Exception("live_connections should not be less than 0!"); if (meet_error || copy_live_conn == 0) - msg_channel.finish(); + finishAllMsgChannels(); +} + +template +void ExchangeReceiverBase::finishAllMsgChannels() +{ + for (auto & msg_channel : msg_channels) + msg_channel->finish(); +} + +template +void ExchangeReceiverBase::cancelAllMsgChannels() +{ + for (auto & msg_channel : msg_channels) + msg_channel->cancel(); } /// Explicit template instantiations - to avoid code bloat in headers. diff --git a/dbms/src/Flash/Mpp/ExchangeReceiver.h b/dbms/src/Flash/Mpp/ExchangeReceiver.h index 830dc6241a9..708f133f226 100644 --- a/dbms/src/Flash/Mpp/ExchangeReceiver.h +++ b/dbms/src/Flash/Mpp/ExchangeReceiver.h @@ -35,9 +35,28 @@ namespace DB { struct ReceivedMessage { - std::shared_ptr packet; - size_t source_index = 0; + size_t source_index; String req_info; + // shared_ptr is copied to make sure error_ptr, resp_ptr and chunks are valid. + const std::shared_ptr packet; + const mpp::Error * error_ptr; + const String * resp_ptr; + std::vector chunks; + + // Constructor that move chunks. + ReceivedMessage(size_t source_index_, + const String & req_info_, + const std::shared_ptr & packet_, + const mpp::Error * error_ptr_, + const String * resp_ptr_, + std::vector && chunks_) + : source_index(source_index_) + , req_info(req_info_) + , packet(packet_) + , error_ptr(error_ptr_) + , resp_ptr(resp_ptr_) + , chunks(chunks_) + {} }; struct ExchangeReceiverResult @@ -78,6 +97,7 @@ enum class ExchangeReceiverState CLOSED, }; +using MsgChannelPtr = std::unique_ptr>>; template class ExchangeReceiverBase @@ -92,7 +112,8 @@ class ExchangeReceiverBase size_t source_num_, size_t max_streams_, const String & req_id, - const String & executor_id); + const String & executor_id, + uint64_t fine_grained_shuffle_stream_count); ~ExchangeReceiverBase(); @@ -104,9 +125,11 @@ class ExchangeReceiverBase ExchangeReceiverResult nextResult( std::queue & block_queue, - const Block & header); + const Block & header, + size_t stream_id); size_t getSourceNum() const { return source_num; } + uint64_t getFineGrainedShuffleStreamCount() const { return fine_grained_shuffle_stream_count; } int computeNewThreadCount() const { return thread_count; } @@ -128,7 +151,10 @@ class ExchangeReceiverBase using Request = typename RPCContext::Request; void setUpConnection(); + // Template argument enable_fine_grained_shuffle will be setup properly in setUpConnection(). 
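The ReceivedMessage layout in ExchangeReceiver.h above relies on an ownership rule worth spelling out: the chunk pointers borrow from the packet, so the packet shared_ptr must travel with them. A self-contained sketch of the same rule (simplified types; std::string stands in for the protobuf-owned chunk data):

```cpp
#include <memory>
#include <string>
#include <vector>

struct Packet
{
    std::vector<std::string> chunks;
};

struct Message
{
    std::shared_ptr<const Packet> packet;    // keeps the chunk storage alive
    std::vector<const std::string *> chunks; // borrowed views into *packet
};

int main()
{
    auto packet = std::make_shared<Packet>(Packet{{"chunk-0", "chunk-1"}});
    Message msg{packet, {&packet->chunks[0], &packet->chunks[1]}};
    packet.reset(); // msg.packet still owns the data, pointers stay valid
    return msg.chunks[0]->size() == 7 ? 0 : 1;
}
```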
+ template void readLoop(const Request & req); + template void reactor(const std::vector & async_requests); bool setEndState(ExchangeReceiverState new_state); @@ -139,12 +165,14 @@ class ExchangeReceiverBase std::queue & block_queue, const Block & header); - void connectionDone( bool meet_error, const String & local_err_msg, const LoggerPtr & log); + void finishAllMsgChannels(); + void cancelAllMsgChannels(); + std::shared_ptr rpc_context; const tipb::ExchangeReceiver pb_exchange_receiver; @@ -156,7 +184,7 @@ class ExchangeReceiverBase std::shared_ptr thread_manager; DAGSchema schema; - MPMCQueue> msg_channel; + std::vector msg_channels; std::mutex mu; /// should lock `mu` when visit these members @@ -168,6 +196,7 @@ class ExchangeReceiverBase bool collected = false; int thread_count = 0; + uint64_t fine_grained_shuffle_stream_count; }; class ExchangeReceiver : public ExchangeReceiverBase diff --git a/dbms/src/Flash/Mpp/MPPHandler.cpp b/dbms/src/Flash/Mpp/MPPHandler.cpp index a3096aaa644..7f97a1dd698 100644 --- a/dbms/src/Flash/Mpp/MPPHandler.cpp +++ b/dbms/src/Flash/Mpp/MPPHandler.cpp @@ -31,7 +31,7 @@ void MPPHandler::handleError(const MPPTaskPtr & task, String error) try { if (task) - task->cancel(error); + task->handleError(error); } catch (...) { diff --git a/dbms/src/Flash/Mpp/MPPReceiverSet.cpp b/dbms/src/Flash/Mpp/MPPReceiverSet.cpp new file mode 100644 index 00000000000..60cca308c18 --- /dev/null +++ b/dbms/src/Flash/Mpp/MPPReceiverSet.cpp @@ -0,0 +1,48 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +void MPPReceiverSet::addExchangeReceiver(const String & executor_id, const ExchangeReceiverPtr & exchange_receiver) +{ + RUNTIME_ASSERT(exchange_receiver_map.find(executor_id) == exchange_receiver_map.end(), log, "Duplicate executor_id: {} in DAGRequest", executor_id); + exchange_receiver_map[executor_id] = exchange_receiver; +} + +void MPPReceiverSet::addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader) +{ + coprocessor_readers.push_back(coprocessor_reader); +} + +ExchangeReceiverPtr MPPReceiverSet::getExchangeReceiver(const String & executor_id) const +{ + auto it = exchange_receiver_map.find(executor_id); + if (unlikely(it == exchange_receiver_map.end())) + return nullptr; + return it->second; +} + +void MPPReceiverSet::cancel() +{ + for (auto & it : exchange_receiver_map) + { + it.second->cancel(); + } + for (auto & cop_reader : coprocessor_readers) + cop_reader->cancel(); +} +} // namespace DB diff --git a/dbms/src/Flash/Mpp/MPPReceiverSet.h b/dbms/src/Flash/Mpp/MPPReceiverSet.h new file mode 100644 index 00000000000..44274cb3ce8 --- /dev/null +++ b/dbms/src/Flash/Mpp/MPPReceiverSet.h @@ -0,0 +1,44 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+
+namespace DB
+{
+class MPPReceiverSet
+{
+public:
+    explicit MPPReceiverSet(const String & req_id)
+        : log(Logger::get("MPPReceiverSet", req_id))
+    {}
+    void addExchangeReceiver(const String & executor_id, const ExchangeReceiverPtr & exchange_receiver);
+    void addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader);
+    ExchangeReceiverPtr getExchangeReceiver(const String & executor_id) const;
+    void cancel();
+
+private:
+    /// Two kinds of receivers exist in MPP:
+    /// ExchangeReceiver: receives data from other MPPTasks
+    /// CoprocessorReader: used in remote read
+    ExchangeReceiverMap exchange_receiver_map;
+    std::vector<CoprocessorReaderPtr> coprocessor_readers;
+    const LoggerPtr log;
+};
+
+using MPPReceiverSetPtr = std::shared_ptr<MPPReceiverSet>;
+
+} // namespace DB
diff --git a/dbms/src/Flash/Mpp/MPPTask.cpp b/dbms/src/Flash/Mpp/MPPTask.cpp
index 0f18ad582b4..7ddc6af361f 100644
--- a/dbms/src/Flash/Mpp/MPPTask.cpp
+++ b/dbms/src/Flash/Mpp/MPPTask.cpp
@@ -22,11 +22,14 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -48,6 +51,7 @@
 extern const char exception_before_mpp_register_tunnel_for_root_mpp_task[];
 extern const char exception_during_mpp_register_tunnel_for_non_root_mpp_task[];
 extern const char exception_during_mpp_write_err_to_tunnel[];
 extern const char force_no_local_region_for_mpp_task[];
+extern const char random_task_lifecycle_failpoint[];
 } // namespace FailPoints

 MPPTask::MPPTask(const mpp::TaskMeta & meta_, const ContextPtr & context_)
     , id(meta.start_ts(), meta.task_id())
     , log(Logger::get("MPPTask", id.toString()))
     , mpp_task_statistics(id, meta.address())
+    , needed_threads(0)
     , schedule_state(ScheduleState::WAITING)
 {}

@@ -76,36 +81,108 @@ MPPTask::~MPPTask()
     LOG_FMT_DEBUG(log, "finish MPPTask: {}", id.toString());
 }

-void MPPTask::closeAllTunnels(const String & reason)
+void MPPTask::abortTunnels(const String & message, AbortType abort_type)
 {
-    for (auto & it : tunnel_map)
+    if (abort_type == AbortType::ONCANCELLATION)
+    {
+        closeAllTunnels(message);
+    }
+    else
     {
-        it.second->close(reason);
+        RUNTIME_ASSERT(tunnel_set != nullptr, log, "mpp task without tunnel set");
+        tunnel_set->writeError(message);
     }
 }

-void MPPTask::finishWrite()
+void MPPTask::abortReceivers()
 {
-    for (const auto & it : tunnel_map)
+    if (likely(receiver_set != nullptr))
     {
-        it.second->writeDone();
+        receiver_set->cancel();
     }
 }

+void MPPTask::abortDataStreams(AbortType abort_type)
+{
+    /// When the abort type is ONERROR, the MPPTask already knows it has met an error, so let the remaining tasks stop silently to avoid flooding the log with useless error messages
+    bool is_kill = abort_type == AbortType::ONCANCELLATION;
+    context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, is_kill);
+}
+
+void MPPTask::closeAllTunnels(const String & reason)
+{
+    if (likely(tunnel_set))
+        tunnel_set->close(reason);
+}
+
+void MPPTask::finishWrite()
+{
RUNTIME_ASSERT(tunnel_set != nullptr, log, "mpp task without tunnel set"); + tunnel_set->finishWrite(); +} + void MPPTask::run() { newThreadManager()->scheduleThenDetach(true, "MPPTask", [self = shared_from_this()] { self->runImpl(); }); } -void MPPTask::registerTunnel(const MPPTaskId & id, MPPTunnelPtr tunnel) +void MPPTask::registerTunnels(const mpp::DispatchTaskRequest & task_request) { - if (status == CANCELLED) - throw Exception("the tunnel " + tunnel->id() + " can not been registered, because the task is cancelled"); + tunnel_set = std::make_shared(log->identifier()); + std::chrono::seconds timeout(task_request.timeout()); + const auto & exchange_sender = dag_req.root_executor().exchange_sender(); - if (tunnel_map.find(id) != tunnel_map.end()) - throw Exception("the tunnel " + tunnel->id() + " has been registered"); + for (int i = 0; i < exchange_sender.encoded_task_meta_size(); ++i) + { + // exchange sender will register the tunnels and wait for the receiver to find a connection. + mpp::TaskMeta task_meta; + if (unlikely(!task_meta.ParseFromString(exchange_sender.encoded_task_meta(i)))) + throw TiFlashException("Failed to decode task meta info in ExchangeSender", Errors::Coprocessor::BadRequest); + bool is_local = context->getSettingsRef().enable_local_tunnel && meta.address() == task_meta.address(); + bool is_async = !is_local && context->getSettingsRef().enable_async_server; + MPPTunnelPtr tunnel = std::make_shared(task_meta, task_request.meta(), timeout, context->getSettingsRef().max_threads, is_local, is_async, log->identifier()); + LOG_FMT_DEBUG(log, "begin to register the tunnel {}", tunnel->id()); + if (status != INITIALIZING) + throw Exception(fmt::format("The tunnel {} can not be registered, because the task is not in initializing state", tunnel->id())); + tunnel_set->registerTunnel(MPPTaskId{task_meta.start_ts(), task_meta.task_id()}, tunnel); + if (!dag_context->isRootMPPTask()) + { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::exception_during_mpp_register_tunnel_for_non_root_mpp_task); + } + } +} - tunnel_map[id] = tunnel; +void MPPTask::initExchangeReceivers() +{ + receiver_set = std::make_shared(log->identifier()); + traverseExecutors(&dag_req, [&](const tipb::Executor & executor) { + if (executor.tp() == tipb::ExecType::TypeExchangeReceiver) + { + assert(executor.has_executor_id()); + const auto & executor_id = executor.executor_id(); + // In order to distinguish different exchange receivers. 
+ auto exchange_receiver = std::make_shared( + std::make_shared( + executor.exchange_receiver(), + dag_context->getMPPTaskMeta(), + context->getTMTContext().getKVCluster(), + context->getTMTContext().getMPPTaskManager(), + context->getSettingsRef().enable_local_tunnel, + context->getSettingsRef().enable_async_grpc_client), + executor.exchange_receiver().encoded_task_meta_size(), + context->getMaxStreams(), + log->identifier(), + executor_id, + executor.fine_grained_shuffle_stream_count()); + if (status != RUNNING) + throw Exception("exchange receiver map can not be initialized, because the task is not in running state"); + + receiver_set->addExchangeReceiver(executor_id, exchange_receiver); + new_thread_count_of_exchange_receiver += exchange_receiver->computeNewThreadCount(); + } + return true; + }); + dag_context->setMPPReceiverSet(receiver_set); } std::pair MPPTask::getTunnel(const ::mpp::EstablishMPPConnectionRequest * request) @@ -120,8 +197,9 @@ std::pair MPPTask::getTunnel(const ::mpp::EstablishMPPConn } MPPTaskId receiver_id{request->receiver_meta().start_ts(), request->receiver_meta().task_id()}; - auto it = tunnel_map.find(receiver_id); - if (it == tunnel_map.end()) + RUNTIME_ASSERT(tunnel_set != nullptr, log, "mpp task without tunnel set"); + auto tunnel_ptr = tunnel_set->getTunnelByReceiverTaskId(receiver_id); + if (tunnel_ptr == nullptr) { auto err_msg = fmt::format( "can't find tunnel ({} + {})", @@ -129,7 +207,7 @@ std::pair MPPTask::getTunnel(const ::mpp::EstablishMPPConn request->receiver_meta().task_id()); return {nullptr, err_msg}; } - return {it->second, ""}; + return {tunnel_ptr, ""}; } void MPPTask::unregisterTask() @@ -211,26 +289,8 @@ void MPPTask::prepare(const mpp::DispatchTaskRequest & task_request) } // register tunnels - tunnel_set = std::make_shared(); - std::chrono::seconds timeout(task_request.timeout()); + registerTunnels(task_request); - for (int i = 0; i < exchange_sender.encoded_task_meta_size(); i++) - { - // exchange sender will register the tunnels and wait receiver to found a connection. - mpp::TaskMeta task_meta; - if (!task_meta.ParseFromString(exchange_sender.encoded_task_meta(i))) - throw TiFlashException("Failed to decode task meta info in ExchangeSender", Errors::Coprocessor::BadRequest); - bool is_local = context->getSettingsRef().enable_local_tunnel && meta.address() == task_meta.address(); - bool is_async = !is_local && context->getSettingsRef().enable_async_server; - MPPTunnelPtr tunnel = std::make_shared(task_meta, task_request.meta(), timeout, context->getSettingsRef().max_threads, is_local, is_async, log->identifier()); - LOG_FMT_DEBUG(log, "begin to register the tunnel {}", tunnel->id()); - registerTunnel(MPPTaskId{task_meta.start_ts(), task_meta.task_id()}, tunnel); - tunnel_set->addTunnel(tunnel); - if (!dag_context->isRootMPPTask()) - { - FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::exception_during_mpp_register_tunnel_for_non_root_mpp_task); - } - } dag_context->tunnel_set = tunnel_set; // register task. 
auto task_manager = tmt_context.getMPPTaskManager(); @@ -256,6 +316,7 @@ void MPPTask::prepare(const mpp::DispatchTaskRequest & task_request) void MPPTask::preprocess() { auto start_time = Clock::now(); + initExchangeReceivers(); DAGQuerySource dag(*context); executeQuery(dag, *context, false, QueryProcessingStage::Complete); auto end_time = Clock::now(); @@ -285,7 +346,7 @@ void MPPTask::runImpl() LOG_FMT_INFO(log, "task starts preprocessing"); preprocess(); needed_threads = estimateCountOfNewThreads(); - LOG_FMT_DEBUG(log, "Estimate new thread count of query :{} including tunnel_threads: {} , receiver_threads: {}", needed_threads, dag_context->tunnel_set->getRemoteTunnelCnt(), dag_context->getNewThreadCountOfExchangeReceiver()); + LOG_FMT_DEBUG(log, "Estimate new thread count of query: {}, including tunnel_threads: {}, receiver_threads: {}", needed_threads, dag_context->tunnel_set->getRemoteTunnelCnt(), new_thread_count_of_exchange_receiver); scheduleOrWait(); @@ -317,104 +378,124 @@ void MPPTask::runImpl() return_statistics.blocks, return_statistics.bytes); } - catch (Exception & e) - { - err_msg = e.displayText(); - LOG_FMT_ERROR(log, "task running meets error: {} Stack Trace : {}", err_msg, e.getStackTrace().toString()); - } - catch (pingcap::Exception & e) - { - err_msg = e.message(); - LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); - } - catch (std::exception & e) - { - err_msg = e.what(); - LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); - } catch (...) { - err_msg = "unrecovered error"; - LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); + err_msg = getCurrentExceptionMessage(true, true); } + if (err_msg.empty()) { - // todo when error happens, should try to update the metrics if it is available - auto throughput = dag_context->getTableScanThroughput(); - if (throughput.first) - GET_METRIC(tiflash_storage_logical_throughput_bytes).Observe(throughput.second); - auto process_info = context->getProcessListElement()->getInfo(); - auto peak_memory = process_info.peak_memory_usage > 0 ? process_info.peak_memory_usage : 0; - GET_METRIC(tiflash_coprocessor_request_memory_usage, type_run_mpp_task).Observe(peak_memory); - mpp_task_statistics.setMemoryPeak(peak_memory); + if (switchStatus(RUNNING, FINISHED)) + LOG_INFO(log, "finish task"); + else + LOG_FMT_WARNING(log, "finish task which is in {} state", taskStatusToString(status)); + if (status == FINISHED) + { + // todo when error happens, should try to update the metrics if it is available + auto throughput = dag_context->getTableScanThroughput(); + if (throughput.first) + GET_METRIC(tiflash_storage_logical_throughput_bytes).Observe(throughput.second); + auto process_info = context->getProcessListElement()->getInfo(); + auto peak_memory = process_info.peak_memory_usage > 0 ? process_info.peak_memory_usage : 0; + GET_METRIC(tiflash_coprocessor_request_memory_usage, type_run_mpp_task).Observe(peak_memory); + mpp_task_statistics.setMemoryPeak(peak_memory); + } } else { - context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, true); - if (dag_context) - dag_context->cancelAllExchangeReceiver(); - writeErrToAllTunnels(err_msg); + if (status == RUNNING) + { + LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); + /// trim the stack trace to avoid too much useless information in the log + trimStackTrace(err_msg); + try + { + handleError(err_msg); + } + catch (...) 
+ { + tryLogCurrentException(log, "Met error while trying to handle error in MPPTask"); + } + } + } LOG_FMT_INFO(log, "task ends, time cost is {} ms.", stopwatch.elapsedMilliseconds()); - unregisterTask(); - - if (switchStatus(RUNNING, FINISHED)) - LOG_INFO(log, "finish task"); - else - LOG_WARNING(log, "finish task which was cancelled before"); + // unregister flag is only for FailPoint usage, to produce the situation where MPPTask is destructed + // by the grpc CancelMPPTask thread + bool unregister = true; + fiu_do_on(FailPoints::random_task_lifecycle_failpoint, { + if (!err_msg.empty()) + unregister = false; + }); + if (unregister) + unregisterTask(); - mpp_task_statistics.end(status.load(), err_msg); + mpp_task_statistics.end(status.load(), err_string); mpp_task_statistics.logTracingJson(); } -void MPPTask::writeErrToAllTunnels(const String & e) +void MPPTask::handleError(const String & error_msg) { - for (auto & it : tunnel_map) - { - try - { - FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::exception_during_mpp_write_err_to_tunnel); - it.second->write(getPacketWithError(e), true); - } - catch (...) - { - it.second->close("Failed to write error msg to tunnel"); - tryLogCurrentException(log, "Failed to write error " + e + " to tunnel: " + it.second->id()); - } - } + if (manager == nullptr || !manager->isTaskToBeCancelled(id)) + abort(error_msg, AbortType::ONERROR); } -void MPPTask::cancel(const String & reason) +void MPPTask::abort(const String & message, AbortType abort_type) { - CPUAffinityManager::getInstance().bindSelfQueryThread(); - LOG_FMT_WARNING(log, "Begin cancel task: {}", id.toString()); + String abort_type_string; + TaskStatus next_task_status; + switch (abort_type) + { + case AbortType::ONCANCELLATION: + abort_type_string = "ONCANCELLATION"; + next_task_status = CANCELLED; + break; + case AbortType::ONERROR: + abort_type_string = "ONERROR"; + next_task_status = FAILED; + break; + } + LOG_FMT_WARNING(log, "Begin abort task: {}, abort type: {}", id.toString(), abort_type_string); while (true) { auto previous_status = status.load(); - if (previous_status == FINISHED || previous_status == CANCELLED) + if (previous_status == FINISHED || previous_status == CANCELLED || previous_status == FAILED) { - LOG_FMT_WARNING(log, "task already {}", (previous_status == FINISHED ? 
"finished" : "cancelled")); + LOG_FMT_WARNING(log, "task already in {} state", taskStatusToString(previous_status)); return; } - else if (previous_status == INITIALIZING && switchStatus(INITIALIZING, CANCELLED)) + else if (previous_status == INITIALIZING && switchStatus(INITIALIZING, next_task_status)) { - closeAllTunnels(reason); + err_string = message; + /// if the task is in initializing state, mpp task can return error to TiDB directly, + /// so just close all tunnels here + closeAllTunnels(message); unregisterTask(); - LOG_WARNING(log, "Finish cancel task from uninitialized"); + LOG_WARNING(log, "Finish abort task from uninitialized"); return; } - else if (previous_status == RUNNING && switchStatus(RUNNING, CANCELLED)) + else if (previous_status == RUNNING && switchStatus(RUNNING, next_task_status)) { + /// abort the components from top to bottom because if bottom components are aborted + /// first, the top components may see an error caused by the abort, which is not + /// the original error + err_string = message; + abortTunnels(message, abort_type); + abortDataStreams(abort_type); + abortReceivers(); scheduleThisTask(ScheduleState::FAILED); - context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, true); - closeAllTunnels(reason); /// runImpl is running, leave remaining work to runImpl - LOG_WARNING(log, "Finish cancel task from running"); + LOG_WARNING(log, "Finish abort task from running"); return; } } } +void MPPTask::cancel(const String & reason) +{ + CPUAffinityManager::getInstance().bindSelfQueryThread(); + abort(reason, AbortType::ONCANCELLATION); +} + bool MPPTask::switchStatus(TaskStatus from, TaskStatus to) { return status.compare_exchange_strong(from, to); diff --git a/dbms/src/Flash/Mpp/MPPTask.h b/dbms/src/Flash/Mpp/MPPTask.h index c34cae49699..a30150b26e8 100644 --- a/dbms/src/Flash/Mpp/MPPTask.h +++ b/dbms/src/Flash/Mpp/MPPTask.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -58,12 +59,12 @@ class MPPTask : public std::enable_shared_from_this void cancel(const String & reason); + void handleError(const String & error_msg); + void prepare(const mpp::DispatchTaskRequest & task_request); void run(); - void registerTunnel(const MPPTaskId & id, MPPTunnelPtr tunnel); - int getNeededThreads(); enum class ScheduleState @@ -91,12 +92,22 @@ class MPPTask : public std::enable_shared_from_this void unregisterTask(); - void writeErrToAllTunnels(const String & e); - /// Similar to `writeErrToAllTunnels`, but it just try to write the error message to tunnel /// without waiting the tunnel to be connected void closeAllTunnels(const String & reason); + enum class AbortType + { + /// todo add ONKILL to distinguish between silent cancellation and kill + ONCANCELLATION, + ONERROR, + }; + void abort(const String & message, AbortType abort_type); + + void abortTunnels(const String & message, AbortType abort_type); + void abortReceivers(); + void abortDataStreams(AbortType abort_type); + void finishWrite(); bool switchStatus(TaskStatus from, TaskStatus to); @@ -107,6 +118,10 @@ class MPPTask : public std::enable_shared_from_this int estimateCountOfNewThreads(); + void registerTunnels(const mpp::DispatchTaskRequest & task_request); + + void initExchangeReceivers(); + tipb::DAGRequest dag_req; ContextPtr context; @@ -116,6 +131,7 @@ class MPPTask : public std::enable_shared_from_this MemoryTracker * memory_tracker = nullptr; std::atomic status{INITIALIZING}; + String err_string; mpp::TaskMeta meta; 
@@ -123,8 +139,9 @@ class MPPTask : public std::enable_shared_from_this MPPTunnelSetPtr tunnel_set; - // which targeted task we should send data by which tunnel. - std::unordered_map tunnel_map; + MPPReceiverSetPtr receiver_set; + + int new_thread_count_of_exchange_receiver = 0; MPPTaskManager * manager = nullptr; @@ -132,8 +149,6 @@ class MPPTask : public std::enable_shared_from_this MPPTaskStatistics mpp_task_statistics; - Exception err; - friend class MPPTaskManager; int needed_threads; diff --git a/dbms/src/Flash/Mpp/MPPTaskManager.cpp b/dbms/src/Flash/Mpp/MPPTaskManager.cpp index 531f8f7a10d..c5499eda89d 100644 --- a/dbms/src/Flash/Mpp/MPPTaskManager.cpp +++ b/dbms/src/Flash/Mpp/MPPTaskManager.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -22,6 +23,11 @@ namespace DB { +namespace FailPoints +{ +extern const char random_task_manager_find_task_failure_failpoint[]; +} // namespace FailPoints + MPPTaskManager::MPPTaskManager(MPPTaskSchedulerPtr scheduler_) : scheduler(std::move(scheduler_)) , log(&Poco::Logger::get("TaskManager")) @@ -50,6 +56,7 @@ MPPTaskPtr MPPTaskManager::findTaskWithTimeout(const mpp::TaskMeta & meta, std:: it = query_it->second->task_map.find(id); return it != query_it->second->task_map.end(); }); + fiu_do_on(FailPoints::random_task_manager_find_task_failure_failpoint, ret = false;); if (cancelled) { errMsg = fmt::format("Task [{},{}] has been cancelled.", meta.start_ts(), meta.task_id()); @@ -140,6 +147,17 @@ bool MPPTaskManager::registerTask(MPPTaskPtr task) return true; } +bool MPPTaskManager::isTaskToBeCancelled(const MPPTaskId & task_id) +{ + std::unique_lock lock(mu); + auto it = mpp_query_map.find(task_id.start_ts); + if (it != mpp_query_map.end() && it->second->to_be_cancelled) + { + return it->second->task_map.find(task_id) != it->second->task_map.end(); + } + return false; +} + void MPPTaskManager::unregisterTask(MPPTask * task) { std::unique_lock lock(mu); diff --git a/dbms/src/Flash/Mpp/MPPTaskManager.h b/dbms/src/Flash/Mpp/MPPTaskManager.h index d7047804aca..770acea3853 100644 --- a/dbms/src/Flash/Mpp/MPPTaskManager.h +++ b/dbms/src/Flash/Mpp/MPPTaskManager.h @@ -73,6 +73,8 @@ class MPPTaskManager : private boost::noncopyable void unregisterTask(MPPTask * task); + bool isTaskToBeCancelled(const MPPTaskId & task_id); + bool tryToScheduleTask(const MPPTaskPtr & task); void releaseThreadsFromScheduler(const int needed_threads); diff --git a/dbms/src/Flash/Mpp/MPPTunnel.cpp b/dbms/src/Flash/Mpp/MPPTunnel.cpp index 826e7fea88a..16fe4ae42cc 100644 --- a/dbms/src/Flash/Mpp/MPPTunnel.cpp +++ b/dbms/src/Flash/Mpp/MPPTunnel.cpp @@ -25,6 +25,7 @@ namespace DB namespace FailPoints { extern const char exception_during_mpp_close_tunnel[]; +extern const char random_tunnel_wait_timeout_failpoint[]; } // namespace FailPoints template @@ -219,7 +220,11 @@ void MPPTunnelBase::sendJob(bool need_lock) err_msg = "fatal error in sendJob()"; } if (!err_msg.empty()) + { + /// append tunnel id to error message + err_msg = fmt::format("{} meet error: {}", tunnel_id, err_msg); LOG_ERROR(log, err_msg); + } consumerFinish(err_msg, need_lock); if (is_async) writer->writeDone(grpc::Status::OK); @@ -322,6 +327,7 @@ void MPPTunnelBase::waitUntilConnectedOrFinished(std::unique_lock +#include #include +#include #include namespace DB { +namespace FailPoints +{ +extern const char exception_during_mpp_write_err_to_tunnel[]; +} // namespace FailPoints namespace { inline 
mpp::MPPDataPacket serializeToPacket(const tipb::SelectResponse & response) @@ -108,6 +114,65 @@ void MPPTunnelSetBase::write(mpp::MPPDataPacket & packet, int16_t partit tunnels[partition_id]->write(packet); } +template +void MPPTunnelSetBase::writeError(const String & msg) +{ + for (auto & tunnel : tunnels) + { + try + { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::exception_during_mpp_write_err_to_tunnel); + tunnel->write(getPacketWithError(msg), true); + } + catch (...) + { + tunnel->close("Failed to write error msg to tunnel"); + tryLogCurrentException(log, "Failed to write error " + msg + " to tunnel: " + tunnel->id()); + } + } +} + +template +void MPPTunnelSetBase::registerTunnel(const MPPTaskId & receiver_task_id, const TunnelPtr & tunnel) +{ + if (receiver_task_id_to_index_map.find(receiver_task_id) != receiver_task_id_to_index_map.end()) + throw Exception(fmt::format("the tunnel {} has been registered", tunnel->id())); + + receiver_task_id_to_index_map[receiver_task_id] = tunnels.size(); + tunnels.push_back(tunnel); + if (!tunnel->isLocal()) + { + remote_tunnel_cnt++; + } +} + +template +void MPPTunnelSetBase::close(const String & reason) +{ + for (auto & tunnel : tunnels) + tunnel->close(reason); +} + +template +void MPPTunnelSetBase::finishWrite() +{ + for (auto & tunnel : tunnels) + { + tunnel->writeDone(); + } +} + +template +typename MPPTunnelSetBase::TunnelPtr MPPTunnelSetBase::getTunnelByReceiverTaskId(const MPPTaskId & id) +{ + auto it = receiver_task_id_to_index_map.find(id); + if (it == receiver_task_id_to_index_map.end()) + { + return nullptr; + } + return tunnels[it->second]; +} + /// Explicit template instantiations - to avoid code bloat in headers. template class MPPTunnelSetBase; diff --git a/dbms/src/Flash/Mpp/MPPTunnelSet.h b/dbms/src/Flash/Mpp/MPPTunnelSet.h index f2279b945cb..e4123db1be5 100644 --- a/dbms/src/Flash/Mpp/MPPTunnelSet.h +++ b/dbms/src/Flash/Mpp/MPPTunnelSet.h @@ -14,6 +14,7 @@ #pragma once +#include #include #ifdef __clang__ #pragma clang diagnostic push @@ -32,6 +33,9 @@ class MPPTunnelSetBase : private boost::noncopyable { public: using TunnelPtr = std::shared_ptr; + explicit MPPTunnelSetBase(const String & req_id) + : log(Logger::get("MPPTunnelSet", req_id)) + {} void clearExecutionSummaries(tipb::SelectResponse & response); @@ -50,17 +54,14 @@ class MPPTunnelSetBase : private boost::noncopyable // this is a partition writing. 
void write(tipb::SelectResponse & response, int16_t partition_id); void write(mpp::MPPDataPacket & packet, int16_t partition_id); + void writeError(const String & msg); + void close(const String & reason); + void finishWrite(); + void registerTunnel(const MPPTaskId & id, const TunnelPtr & tunnel); - uint16_t getPartitionNum() const { return tunnels.size(); } + TunnelPtr getTunnelByReceiverTaskId(const MPPTaskId & id); - void addTunnel(const TunnelPtr & tunnel) - { - tunnels.push_back(tunnel); - if (!tunnel->isLocal()) - { - remote_tunnel_cnt++; - } - } + uint16_t getPartitionNum() const { return tunnels.size(); } int getRemoteTunnelCnt() { @@ -71,6 +72,8 @@ class MPPTunnelSetBase : private boost::noncopyable private: std::vector tunnels; + std::unordered_map receiver_task_id_to_index_map; + const LoggerPtr log; int remote_tunnel_cnt = 0; }; diff --git a/dbms/src/Flash/Mpp/MinTSOScheduler.cpp b/dbms/src/Flash/Mpp/MinTSOScheduler.cpp index af525bd1a55..967bfcecfa3 100644 --- a/dbms/src/Flash/Mpp/MinTSOScheduler.cpp +++ b/dbms/src/Flash/Mpp/MinTSOScheduler.cpp @@ -12,12 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include namespace DB { +namespace FailPoints +{ +extern const char random_min_tso_scheduler_failpoint[]; +} // namespace FailPoints + constexpr UInt64 MAX_UINT64 = std::numeric_limits::max(); constexpr UInt64 OS_THREAD_SOFT_LIMIT = 100000; @@ -193,7 +199,9 @@ bool MinTSOScheduler::scheduleImp(const UInt64 tso, const MPPQueryTaskSetPtr & q } else { - if (tso <= min_tso) /// the min_tso query should fully run, otherwise throw errors here. + bool is_tso_min = tso <= min_tso; + fiu_do_on(FailPoints::random_min_tso_scheduler_failpoint, is_tso_min = true;); + if (is_tso_min) /// the min_tso query should fully run, otherwise throw errors here. { has_error = true; auto msg = fmt::format("threads are unavailable for the query {} ({} min_tso {}) {}, need {}, but used {} of the thread hard limit {}, {} active and {} waiting queries.", tso, tso == min_tso ? "is" : "is newer than", min_tso, isWaiting ? "from the waiting set" : "when directly schedule it", needed_threads, estimated_thread_usage, thread_hard_limit, active_set.size(), waiting_set.size()); diff --git a/dbms/src/Flash/Mpp/TaskStatus.cpp b/dbms/src/Flash/Mpp/TaskStatus.cpp index 423b768faea..c87ae2b8eb4 100644 --- a/dbms/src/Flash/Mpp/TaskStatus.cpp +++ b/dbms/src/Flash/Mpp/TaskStatus.cpp @@ -29,6 +29,8 @@ StringRef taskStatusToString(const TaskStatus & status) return "FINISHED"; case CANCELLED: return "CANCELLED"; + case FAILED: + return "FAILED"; default: throw Exception("Unknown TaskStatus"); } diff --git a/dbms/src/Flash/Mpp/TaskStatus.h b/dbms/src/Flash/Mpp/TaskStatus.h index 999e30790bf..0997c8adc52 100644 --- a/dbms/src/Flash/Mpp/TaskStatus.h +++ b/dbms/src/Flash/Mpp/TaskStatus.h @@ -24,6 +24,7 @@ enum TaskStatus RUNNING, FINISHED, CANCELLED, + FAILED, }; StringRef taskStatusToString(const TaskStatus & status); diff --git a/dbms/src/Flash/Mpp/Utils.cpp b/dbms/src/Flash/Mpp/Utils.cpp index 477c478eef7..21d89b3cd52 100644 --- a/dbms/src/Flash/Mpp/Utils.cpp +++ b/dbms/src/Flash/Mpp/Utils.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
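Aside on the `MPPTunnelSetBase` change above: tunnels stay in a vector (indexed by partition id for writes) while the new `receiver_task_id_to_index_map` resolves a receiver task id to its slot for `EstablishMPPConnection` lookups. A minimal sketch of that two-structure pattern, with a hypothetical integer key standing in for `MPPTaskId`:

```cpp
// Hypothetical, simplified analog of receiver_task_id_to_index_map in MPPTunnelSetBase.
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <unordered_map>
#include <vector>

struct Tunnel { int64_t receiver_task_id; };
using TunnelPtr = std::shared_ptr<Tunnel>;

class TunnelSet
{
public:
    void registerTunnel(int64_t receiver_task_id, const TunnelPtr & tunnel)
    {
        if (id_to_index.count(receiver_task_id))
            throw std::logic_error("tunnel already registered");
        id_to_index[receiver_task_id] = tunnels.size(); // remember the slot ...
        tunnels.push_back(tunnel);                      // ... then append, keeping partition order
    }

    TunnelPtr getTunnelByReceiverTaskId(int64_t receiver_task_id) const
    {
        auto it = id_to_index.find(receiver_task_id);
        return it == id_to_index.end() ? nullptr : tunnels[it->second];
    }

private:
    std::vector<TunnelPtr> tunnels;                  // indexed by partition id for writes
    std::unordered_map<int64_t, size_t> id_to_index; // receiver task id -> slot
};
```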
#include +#include #include @@ -27,4 +28,14 @@ mpp::MPPDataPacket getPacketWithError(String reason) return data; } +void trimStackTrace(String & message) +{ + auto stack_trace_pos = message.find("Stack trace"); + if (stack_trace_pos != String::npos) + { + message.resize(stack_trace_pos); + Poco::trimRightInPlace(message); + } +} + } // namespace DB diff --git a/dbms/src/Flash/Mpp/Utils.h b/dbms/src/Flash/Mpp/Utils.h index 67e2dc3f641..021dc4407d5 100644 --- a/dbms/src/Flash/Mpp/Utils.h +++ b/dbms/src/Flash/Mpp/Utils.h @@ -23,5 +23,6 @@ namespace DB { mpp::MPPDataPacket getPacketWithError(String reason); +void trimStackTrace(String & message); } // namespace DB diff --git a/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp b/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp index 47ce2ee6ee6..706c17ed036 100644 --- a/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp +++ b/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp @@ -382,7 +382,7 @@ TEST_F(TestMPPTunnelBase, WriteError) } catch (Exception & e) { - GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, grpc writes failed."); + GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, 0000_0001 meet error: grpc writes failed."); } } @@ -631,7 +631,7 @@ TEST_F(TestMPPTunnelBase, AsyncWriteError) } catch (Exception & e) { - GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, grpc writes failed."); + GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, 0000_0001 meet error: grpc writes failed."); } } diff --git a/dbms/src/Flash/Planner/Planner.cpp b/dbms/src/Flash/Planner/Planner.cpp index b798123de71..8aae14b9420 100644 --- a/dbms/src/Flash/Planner/Planner.cpp +++ b/dbms/src/Flash/Planner/Planner.cpp @@ -87,9 +87,15 @@ BlockInputStreams Planner::execute() bool Planner::isSupported(const DAGQueryBlock & query_block) { + /// todo support fine grained shuffle + static auto disable_fine_grained_shuffle = [](const DAGQueryBlock & query_block) { + return !enableFineGrainedShuffle(query_block.source->fine_grained_shuffle_stream_count()) + && (!query_block.exchange_sender || !enableFineGrainedShuffle(query_block.exchange_sender->fine_grained_shuffle_stream_count())); + }; return query_block.source && (query_block.source->tp() == tipb::ExecType::TypeProjection - || query_block.source->tp() == tipb::ExecType::TypeExchangeReceiver); + || query_block.source->tp() == tipb::ExecType::TypeExchangeReceiver) + && disable_fine_grained_shuffle(query_block); } DAGContext & Planner::dagContext() const diff --git a/dbms/src/Flash/Planner/plans/PhysicalAggregation.cpp b/dbms/src/Flash/Planner/plans/PhysicalAggregation.cpp index 45e4586dd18..26a6fa574f2 100644 --- a/dbms/src/Flash/Planner/plans/PhysicalAggregation.cpp +++ b/dbms/src/Flash/Planner/plans/PhysicalAggregation.cpp @@ -106,34 +106,39 @@ void PhysicalAggregation::transformImpl(DAGPipeline & pipeline, Context & contex is_final_agg); /// If there are several sources, then we perform parallel aggregation - if (pipeline.streams.size() > 1) + if (pipeline.streams.size() > 1 || pipeline.streams_with_non_joined_data.size() > 1) { const Settings & settings = context.getSettingsRef(); - BlockInputStreamPtr stream_with_non_joined_data = combinedNonJoinedDataStream(pipeline, max_streams, log); - pipeline.firstStream() = std::make_shared( + BlockInputStreamPtr stream = std::make_shared( pipeline.streams, - stream_with_non_joined_data, + pipeline.streams_with_non_joined_data, params, context.getFileProvider(), true, max_streams, settings.aggregation_memory_efficient_merge_threads ? 
static_cast(settings.aggregation_memory_efficient_merge_threads) : static_cast(settings.max_threads), log->identifier()); + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); + // should record for agg before restore concurrency. See #3804. recordProfileStreams(pipeline, context); restoreConcurrency(pipeline, context.getDAGContext()->final_concurrency, log); } else { - BlockInputStreamPtr stream_with_non_joined_data = combinedNonJoinedDataStream(pipeline, max_streams, log); BlockInputStreams inputs; if (!pipeline.streams.empty()) inputs.push_back(pipeline.firstStream()); - else - pipeline.streams.resize(1); - if (stream_with_non_joined_data) - inputs.push_back(stream_with_non_joined_data); + + if (!pipeline.streams_with_non_joined_data.empty()) + inputs.push_back(pipeline.streams_with_non_joined_data.at(0)); + + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::make_shared( std::make_shared(inputs, log->identifier()), params, diff --git a/dbms/src/Flash/Planner/plans/PhysicalExchangeReceiver.cpp b/dbms/src/Flash/Planner/plans/PhysicalExchangeReceiver.cpp index ee40e42e1aa..ca87a85ab17 100644 --- a/dbms/src/Flash/Planner/plans/PhysicalExchangeReceiver.cpp +++ b/dbms/src/Flash/Planner/plans/PhysicalExchangeReceiver.cpp @@ -43,15 +43,14 @@ PhysicalPlanNodePtr PhysicalExchangeReceiver::build( const String & executor_id, const LoggerPtr & log) { - const auto & mpp_exchange_receiver_map = context.getDAGContext()->getMPPExchangeReceiverMap(); - - auto it = mpp_exchange_receiver_map.find(executor_id); - if (unlikely(it == mpp_exchange_receiver_map.end())) + auto mpp_exchange_receiver = context.getDAGContext()->getMPPExchangeReceiver(executor_id); + if (unlikely(mpp_exchange_receiver == nullptr)) throw TiFlashException( fmt::format("Can not find exchange receiver for {}", executor_id), Errors::Planner::Internal); + /// todo support fine grained shuffle + assert(!enableFineGrainedShuffle(mpp_exchange_receiver->getFineGrainedShuffleStreamCount())); - const auto & mpp_exchange_receiver = it->second; NamesAndTypes schema = toNamesAndTypes(mpp_exchange_receiver->getOutputSchema()); auto physical_exchange_receiver = std::make_shared( executor_id, @@ -69,7 +68,7 @@ void PhysicalExchangeReceiver::transformImpl(DAGPipeline & pipeline, Context & c auto & exchange_receiver_io_input_streams = dag_context.getInBoundIOInputStreamsMap()[executor_id]; for (size_t i = 0; i < max_streams; ++i) { - BlockInputStreamPtr stream = std::make_shared(mpp_exchange_receiver, log->identifier(), executor_id); + BlockInputStreamPtr stream = std::make_shared(mpp_exchange_receiver, log->identifier(), executor_id, /*stream_id=*/0); exchange_receiver_io_input_streams.push_back(stream); stream = std::make_shared(stream, 8192, 0, log->identifier()); stream->setExtraInfo("squashing after exchange receiver"); diff --git a/dbms/src/Flash/Planner/plans/PhysicalExchangeSender.cpp b/dbms/src/Flash/Planner/plans/PhysicalExchangeSender.cpp index 373b04a3941..1f99656506e 100644 --- a/dbms/src/Flash/Planner/plans/PhysicalExchangeSender.cpp +++ b/dbms/src/Flash/Planner/plans/PhysicalExchangeSender.cpp @@ -55,10 +55,11 @@ void PhysicalExchangeSender::transformImpl(DAGPipeline & pipeline, Context & con RUNTIME_ASSERT(dag_context.isMPPTask() && dag_context.tunnel_set != nullptr, log, "exchange_sender only run in MPP"); + /// todo support fine grained shuffle int stream_id = 0; pipeline.transform([&](auto & stream) { // 
construct writer - std::unique_ptr response_writer = std::make_unique>( + std::unique_ptr response_writer = std::make_unique>( dag_context.tunnel_set, partition_col_ids, partition_col_collators, @@ -66,7 +67,9 @@ void PhysicalExchangeSender::transformImpl(DAGPipeline & pipeline, Context & con context.getSettingsRef().dag_records_per_chunk, context.getSettingsRef().batch_send_min_limit, stream_id++ == 0, /// only one stream needs to send execution summaries for the last response - dag_context); + dag_context, + 0, + 0); stream = std::make_shared(stream, std::move(response_writer), log->identifier()); }); } diff --git a/dbms/src/Flash/Planner/plans/PhysicalTopN.cpp b/dbms/src/Flash/Planner/plans/PhysicalTopN.cpp index d572435d645..a78dd350264 100644 --- a/dbms/src/Flash/Planner/plans/PhysicalTopN.cpp +++ b/dbms/src/Flash/Planner/plans/PhysicalTopN.cpp @@ -62,7 +62,7 @@ void PhysicalTopN::transformImpl(DAGPipeline & pipeline, Context & context, size executeExpression(pipeline, before_sort_actions, log, "before TopN"); - orderStreams(pipeline, max_streams, order_descr, limit, context, log); + orderStreams(pipeline, max_streams, order_descr, limit, false, context, log); } void PhysicalTopN::finalize(const Names & parent_require) diff --git a/dbms/src/Flash/Planner/tests/gtest_physical_plan.cpp b/dbms/src/Flash/Planner/tests/gtest_physical_plan.cpp index 7ad06fbeb04..45414b717df 100644 --- a/dbms/src/Flash/Planner/tests/gtest_physical_plan.cpp +++ b/dbms/src/Flash/Planner/tests/gtest_physical_plan.cpp @@ -69,7 +69,7 @@ class PhysicalPlanTestRunner : public DB::tests::ExecutorTest ASSERT_EQ(Poco::trim(expected_streams), Poco::trim(fb.toString())); } - readAndAssertBlock(final_stream, expect_columns); + ASSERT_COLUMNS_EQ_R(readBlock(final_stream), expect_columns); } LoggerPtr log = Logger::get("PhysicalPlanTestRunner", "test_physical_plan"); diff --git a/dbms/src/Flash/tests/CMakeLists.txt b/dbms/src/Flash/tests/CMakeLists.txt index a34e4b23432..944908dcb25 100644 --- a/dbms/src/Flash/tests/CMakeLists.txt +++ b/dbms/src/Flash/tests/CMakeLists.txt @@ -13,14 +13,3 @@ # limitations under the License. include_directories (${CMAKE_CURRENT_BINARY_DIR}) - -add_executable (exchange_perftest - exchange_perftest.cpp - ${TiFlash_SOURCE_DIR}/dbms/src/Server/StorageConfigParser.cpp - ${TiFlash_SOURCE_DIR}/dbms/src/Functions/FunctionsConversion.cpp) -target_link_libraries (exchange_perftest - gtest_main - dbms - clickhouse_functions - clickhouse_aggregate_functions - tiflash-dttool-lib) diff --git a/dbms/src/Flash/tests/WindowTestUtil.h b/dbms/src/Flash/tests/WindowTestUtil.h new file mode 100644 index 00000000000..b7385380419 --- /dev/null +++ b/dbms/src/Flash/tests/WindowTestUtil.h @@ -0,0 +1,67 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
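The planner and executor changes above all gate the new code paths on `enableFineGrainedShuffle`. Its definition is not part of this diff; assuming it is simply `stream_count > 0` (which is consistent with every call site in this patch, but is an assumption), the gating in `Planner::isSupported` reduces to the sketch below:

```cpp
// Sketch only: assumes enableFineGrainedShuffle(stream_count) is `stream_count > 0`,
// which this diff uses but does not show.
#include <cstdint>
#include <cstdio>

inline bool enableFineGrainedShuffle(uint64_t stream_count) { return stream_count > 0; }

int main()
{
    // The planner keeps the new path only when neither the source executor nor the
    // exchange sender requests fine-grained shuffle (see Planner::isSupported above).
    uint64_t source_streams = 0, sender_streams = 8;
    bool supported = !enableFineGrainedShuffle(source_streams) && !enableFineGrainedShuffle(sender_streams);
    std::printf("planner path supported: %d\n", supported); // 0: falls back to the old interpreter
}
```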
+ +#pragma once + +#include + +namespace DB +{ +namespace tests +{ + +inline std::shared_ptr mockInterpreter(Context & context, const std::vector & source_columns, int concurrency) +{ + std::vector mock_input_streams_vec = {}; + DAGQueryBlock mock_query_block(0, static_cast>(nullptr)); + std::vector mock_subqueries_for_sets = {}; + std::shared_ptr mock_interpreter = std::make_shared(context, + mock_input_streams_vec, + mock_query_block, + concurrency); + mock_interpreter->analyzer = std::make_unique(std::move(source_columns), context); + return mock_interpreter; +} + +inline void mockExecuteProject(std::shared_ptr & mock_interpreter, DAGPipeline & pipeline, NamesWithAliases & final_project) +{ + mock_interpreter->executeProject(pipeline, final_project); +} + +inline void mockExecuteWindowOrder(std::shared_ptr & mock_interpreter, DAGPipeline & pipeline, const tipb::Sort & sort, uint64_t fine_grained_shuffle_stream_count) +{ + mock_interpreter->handleWindowOrder(pipeline, sort, ::DB::enableFineGrainedShuffle(fine_grained_shuffle_stream_count)); + mock_interpreter->input_streams_vec[0] = pipeline.streams; + NamesWithAliases final_project; + for (const auto & column : (*mock_interpreter->analyzer).source_columns) + { + final_project.push_back({column.name, ""}); + } + mockExecuteProject(mock_interpreter, pipeline, final_project); +} + +inline void mockExecuteWindow(std::shared_ptr & mock_interpreter, DAGPipeline & pipeline, const tipb::Window & window, uint64_t fine_grained_shuffle_stream_count) +{ + mock_interpreter->handleWindow(pipeline, window, ::DB::enableFineGrainedShuffle(fine_grained_shuffle_stream_count)); + mock_interpreter->input_streams_vec[0] = pipeline.streams; + NamesWithAliases final_project; + for (const auto & column : (*mock_interpreter->analyzer).source_columns) + { + final_project.push_back({column.name, ""}); + } + mockExecuteProject(mock_interpreter, pipeline, final_project); +} + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/tests/bench_exchange.cpp b/dbms/src/Flash/tests/bench_exchange.cpp new file mode 100644 index 00000000000..d6e3f3e825e --- /dev/null +++ b/dbms/src/Flash/tests/bench_exchange.cpp @@ -0,0 +1,466 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
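Both `mockExecuteWindowOrder` and `mockExecuteWindow` above finish by projecting every analyzer column under its own name, so the test pipeline's header stays stable across steps. A tiny standalone analog of that step (simplified types; the real helpers go through `DAGExpressionAnalyzer` and `NamesWithAliases`, and an empty alias keeps the original name):

```cpp
// Hypothetical, simplified analog of the final-projection step in WindowTestUtil.h.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct NameAndTypePair { std::string name; std::string type; };
using NamesWithAliases = std::vector<std::pair<std::string, std::string>>;

// Project every analyzer column under its own name (empty alias keeps the name).
NamesWithAliases projectAllSourceColumns(const std::vector<NameAndTypePair> & source_columns)
{
    NamesWithAliases final_project;
    for (const auto & column : source_columns)
        final_project.push_back({column.name, ""});
    return final_project;
}

int main()
{
    std::vector<NameAndTypePair> cols{{"c1", "Int64"}, {"c2", "String"}};
    for (const auto & [name, alias] : projectAllSourceColumns(cols))
        std::cout << name << " -> '" << alias << "'\n";
}
```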
+ +#include +#include + +#include // to include the implementation of StreamingDAGResponseWriter +#include // to include the implementation of ExchangeReceiver +#include // to include the implementation of MPPTunnel +#include // to include the implementation of MPPTunnelSet +#include +#include + + +namespace DB +{ +namespace tests +{ + +std::random_device rd; + +MockBlockInputStream::MockBlockInputStream(const std::vector & blocks_, StopFlag & stop_flag_) + : blocks(blocks_) + , header(blocks[0].cloneEmpty()) + , mt(rd()) + , dist(0, blocks.size() - 1) + , stop_flag(stop_flag_) +{} + +MockFixedRowsBlockInputStream::MockFixedRowsBlockInputStream(size_t total_rows_, const std::vector & blocks_) + : header(blocks_[0].cloneEmpty()) + , mt(rd()) + , dist(0, blocks_.size() - 1) + , current_rows(0) + , total_rows(total_rows_) + , blocks(blocks_) +{} + +Block makeBlock(int row_num, bool skew) +{ + InferredDataVector> int64_vec; + InferredDataVector> int64_vec2; + InferredDataVector> string_vec; + + if (skew) + { + for (int i = 0; i < row_num; ++i) + { + int64_vec.emplace_back(100); + int64_vec2.emplace_back(100); + } + + for (int i = 0; i < row_num; ++i) + { + string_vec.push_back("abcdefg"); + } + } + else + { + std::mt19937 mt(rd()); + std::uniform_int_distribution int64_dist; + std::uniform_int_distribution len_dist(10, 20); + std::uniform_int_distribution char_dist; + + for (int i = 0; i < row_num; ++i) + { + int64_vec.emplace_back(int64_dist(mt)); + int64_vec2.emplace_back(int64_dist(mt)); + } + + for (int i = 0; i < row_num; ++i) + { + int len = len_dist(mt); + String s; + for (int j = 0; j < len; ++j) + s.push_back(char_dist(mt)); + string_vec.push_back(std::move(s)); + } + } + + auto int64_data_type = makeDataType>(); + ColumnWithTypeAndName int64_column(makeColumn>(int64_data_type, int64_vec), int64_data_type, "int64_1"); + ColumnWithTypeAndName int64_column2(makeColumn>(int64_data_type, int64_vec2), int64_data_type, "int64_2"); + + auto string_data_type = makeDataType>(); + ColumnWithTypeAndName string_column(makeColumn>(string_data_type, string_vec), string_data_type, "string"); + + return Block({int64_column, string_column, int64_column2}); +} + +std::vector makeBlocks(int block_num, int row_num, bool skew) +{ + std::vector blocks; + for (int i = 0; i < block_num; ++i) + blocks.push_back(makeBlock(row_num, skew)); + return blocks; +} + +mpp::MPPDataPacket makePacket(ChunkCodecStream & codec, int row_num) +{ + auto block = makeBlock(row_num); + codec.encode(block, 0, row_num); + + mpp::MPPDataPacket packet; + packet.add_chunks(codec.getString()); + codec.clear(); + + return packet; +} + +std::vector makePackets(ChunkCodecStream & codec, int packet_num, int row_num) +{ + std::vector packets; + for (int i = 0; i < packet_num; ++i) + packets.push_back(std::make_shared(makePacket(codec, row_num))); + return packets; +} + +std::vector makePacketQueues(int source_num, int queue_size) +{ + std::vector queues(source_num); + for (int i = 0; i < source_num; ++i) + queues[i] = std::make_shared(queue_size); + return queues; +} + +std::vector makeFields() +{ + std::vector fields(3); + fields[0].set_tp(TiDB::TypeLongLong); + fields[1].set_tp(TiDB::TypeString); + fields[2].set_tp(TiDB::TypeLongLong); + return fields; +} + +void printException(const Exception & e) +{ + std::string text = e.displayText(); + + auto embedded_stack_trace_pos = text.find("Stack trace"); + std::cerr << "Code: " << e.code() << ". 
" << text << std::endl + << std::endl; + if (std::string::npos == embedded_stack_trace_pos) + std::cerr << "Stack trace:" << std::endl + << e.getStackTrace().toString() << std::endl; +} + +ReceiverHelper::ReceiverHelper(int concurrency_, int source_num_, uint32_t fine_grained_shuffle_stream_count_) + : concurrency(concurrency_) + , source_num(source_num_) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) +{ + pb_exchange_receiver.set_tp(tipb::Hash); + for (int i = 0; i < source_num; ++i) + { + mpp::TaskMeta task; + task.set_start_ts(0); + task.set_task_id(i); + task.set_partition_id(i); + task.set_address(""); + + String encoded_task; + task.SerializeToString(&encoded_task); + + pb_exchange_receiver.add_encoded_task_meta(encoded_task); + } + + fields = makeFields(); + *pb_exchange_receiver.add_field_types() = fields[0]; + *pb_exchange_receiver.add_field_types() = fields[1]; + *pb_exchange_receiver.add_field_types() = fields[2]; + + task_meta.set_task_id(100); + + queues = makePacketQueues(source_num, 10); +} + +MockExchangeReceiverPtr ReceiverHelper::buildReceiver() +{ + return std::make_shared( + std::make_shared(queues, fields), + source_num, + concurrency, + "mock_req_id", + "mock_exchange_receiver_id", + fine_grained_shuffle_stream_count); +} + +std::vector ReceiverHelper::buildExchangeReceiverStream() +{ + auto receiver = buildReceiver(); + std::vector streams(concurrency); + // NOTE: check if need fine_grained_shuffle_stream_count + for (int i = 0; i < concurrency; ++i) + { + streams[i] = std::make_shared(receiver, + "mock_req_id", + "mock_executor_id" + std::to_string(i), + /*stream_id=*/enableFineGrainedShuffle(fine_grained_shuffle_stream_count) ? i : 0); + } + return streams; +} + +BlockInputStreamPtr ReceiverHelper::buildUnionStream() +{ + auto streams = buildExchangeReceiverStream(); + return std::make_shared>(streams, BlockInputStreams{}, concurrency, /*req_id=*/""); +} + +void ReceiverHelper::finish() +{ + if (join_ptr) + { + join_ptr->setBuildTableState(Join::BuildTableState::SUCCEED); + std::cout << fmt::format("Hash table size: {} bytes", join_ptr->getTotalByteCount()) << std::endl; + } +} + +SenderHelper::SenderHelper( + int source_num_, + int concurrency_, + uint32_t fine_grained_shuffle_stream_count_, + int64_t fine_grained_shuffle_batch_size_, + const std::vector & queues_, + const std::vector & fields) + : source_num(source_num_) + , concurrency(concurrency_) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) + , fine_grained_shuffle_batch_size(fine_grained_shuffle_batch_size_) + , queues(queues_) +{ + mpp::TaskMeta task_meta; + tunnel_set = std::make_shared("mock_req_id"); + for (int i = 0; i < source_num; ++i) + { + auto writer = std::make_shared(queues[i]); + mock_writers.push_back(writer); + + auto tunnel = std::make_shared( + task_meta, + task_meta, + std::chrono::seconds(60), + concurrency, + false, + false, + "mock_req_id"); + tunnel->connect(writer.get()); + tunnels.push_back(tunnel); + MPPTaskId id(0, i); + tunnel_set->registerTunnel(id, tunnel); + } + + tipb::DAGRequest dag_request; + tipb::Executor root_executor; + root_executor.set_executor_id("ExchangeSender_100"); + *dag_request.mutable_root_executor() = root_executor; + + dag_context = std::make_unique(dag_request); + dag_context->is_mpp_task = true; + dag_context->is_root_mpp_task = false; + dag_context->encode_type = tipb::EncodeType::TypeCHBlock; + dag_context->result_field_types = fields; +} + +BlockInputStreamPtr SenderHelper::buildUnionStream( + 
StopFlag & stop_flag, + const std::vector & blocks) +{ + std::vector send_streams; + for (int i = 0; i < concurrency; ++i) + { + BlockInputStreamPtr stream = std::make_shared(blocks, stop_flag); + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count)) + { + std::unique_ptr response_writer( + new StreamingDAGResponseWriter( + tunnel_set, + {0, 1, 2}, + TiDB::TiDBCollators(3), + tipb::Hash, + -1, + -1, + true, + *dag_context, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size)); + send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + } + else + { + std::unique_ptr response_writer( + new StreamingDAGResponseWriter( + tunnel_set, + {0, 1, 2}, + TiDB::TiDBCollators(3), + tipb::Hash, + -1, + -1, + true, + *dag_context, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size)); + send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + } + } + + return std::make_shared>(send_streams, BlockInputStreams{}, concurrency, /*req_id=*/""); +} + +BlockInputStreamPtr SenderHelper::buildUnionStream(size_t total_rows, const std::vector & blocks) +{ + std::vector send_streams; + for (int i = 0; i < concurrency; ++i) + { + BlockInputStreamPtr stream = std::make_shared(total_rows / concurrency, blocks); + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count)) + { + std::unique_ptr response_writer( + new StreamingDAGResponseWriter( + tunnel_set, + {0, 1, 2}, + TiDB::TiDBCollators(3), + tipb::Hash, + -1, + -1, + true, + *dag_context, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size)); + send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + } + else + { + std::unique_ptr response_writer( + new StreamingDAGResponseWriter( + tunnel_set, + {0, 1, 2}, + TiDB::TiDBCollators(3), + tipb::Hash, + -1, + -1, + true, + *dag_context, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size)); + send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + } + } + + return std::make_shared>(send_streams, BlockInputStreams{}, concurrency, /*req_id=*/""); +} + +void SenderHelper::finish() +{ + for (size_t i = 0; i < tunnels.size(); ++i) + { + tunnels[i]->writeDone(); + tunnels[i]->waitForFinish(); + mock_writers[i]->finish(); + } +} + +void ExchangeBench::SetUp(const benchmark::State &) +{ + DynamicThreadPool::global_instance = std::make_unique( + /*fixed_thread_num=*/300, + std::chrono::milliseconds(100000)); + + uniform_blocks = makeBlocks(/*block_num=*/100, /*row_num=*/1024); + skew_blocks = makeBlocks(/*block_num=*/100, /*row_num=*/1024, /*skew=*/true); + + try + { + DB::registerWindowFunctions(); + DB::registerFunctions(); + } + catch (DB::Exception &) + { + // Maybe another test has already registered, ignore exception here. + } +} + +void ExchangeBench::TearDown(const benchmark::State &) +{ + uniform_blocks.clear(); + skew_blocks.clear(); + // NOTE: Must reset here, otherwise DynamicThreadPool::fixedWork() may core because metrics already destroyed. 
+ DynamicThreadPool::global_instance.reset(); +} + +void ExchangeBench::runAndWait(std::shared_ptr receiver_helper, + BlockInputStreamPtr receiver_stream, + std::shared_ptr & sender_helper, + BlockInputStreamPtr sender_stream) +{ + std::future sender_future = DynamicThreadPool::global_instance->schedule(/*memory_tracker=*/false, + [sender_stream, sender_helper] { + sender_stream->readPrefix(); + while (const auto & block = sender_stream->read()) {} + sender_stream->readSuffix(); + sender_helper->finish(); + }); + std::future receiver_future = DynamicThreadPool::global_instance->schedule(/*memory_tracker=*/false, + [receiver_stream, receiver_helper] { + receiver_stream->readPrefix(); + while (const auto & block = receiver_stream->read()) {} + receiver_stream->readSuffix(); + receiver_helper->finish(); + }); + sender_future.get(); + receiver_future.get(); +} + +BENCHMARK_DEFINE_F(ExchangeBench, basic_send_receive) +(benchmark::State & state) +try +{ + const int concurrency = state.range(0); + const int source_num = state.range(1); + const int total_rows = state.range(2); + const int fine_grained_shuffle_stream_count = state.range(3); + const int fine_grained_shuffle_batch_size = state.range(4); + Context context = TiFlashTestEnv::getContext(); + + for (auto _ : state) + { + std::shared_ptr receiver_helper = std::make_shared(concurrency, source_num, fine_grained_shuffle_stream_count); + BlockInputStreamPtr receiver_stream = receiver_helper->buildUnionStream(); + + std::shared_ptr sender_helper = std::make_shared(source_num, + concurrency, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size, + receiver_helper->queues, + receiver_helper->fields); + BlockInputStreamPtr sender_stream = sender_helper->buildUnionStream(total_rows, uniform_blocks); + + runAndWait(receiver_helper, receiver_stream, sender_helper, sender_stream); + } +} +CATCH +BENCHMARK_REGISTER_F(ExchangeBench, basic_send_receive) + ->Args({8, 1, 1024 * 1000, 0, 4096}) + ->Args({8, 1, 1024 * 1000, 4, 4096}) + ->Args({8, 1, 1024 * 1000, 8, 4096}) + ->Args({8, 1, 1024 * 1000, 16, 4096}) + ->Args({8, 1, 1024 * 1000, 32, 4096}) + ->Args({8, 1, 1024 * 1000, 8, 1}) + ->Args({8, 1, 1024 * 1000, 8, 1000}) + ->Args({8, 1, 1024 * 1000, 8, 10000}) + ->Args({8, 1, 1024 * 1000, 8, 100000}); + + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/tests/bench_exchange.h b/dbms/src/Flash/tests/bench_exchange.h new file mode 100644 index 00000000000..d8300d45740 --- /dev/null +++ b/dbms/src/Flash/tests/bench_exchange.h @@ -0,0 +1,299 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
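For readers new to the google-benchmark fixture API used by `ExchangeBench` above, here is a minimal self-contained example of the same `BENCHMARK_DEFINE_F` / `BENCHMARK_REGISTER_F` / `Args` pattern (a toy fixture for illustration, not part of this PR):

```cpp
// Minimal google-benchmark fixture mirroring the ExchangeBench registration pattern.
#include <benchmark/benchmark.h>
#include <vector>

class CopyBench : public benchmark::Fixture
{
public:
    void SetUp(const benchmark::State & state) override { src.assign(state.range(0), 42); }
    void TearDown(const benchmark::State &) override { src.clear(); }
    std::vector<int> src;
};

BENCHMARK_DEFINE_F(CopyBench, vector_copy)
(benchmark::State & state)
{
    for (auto _ : state) // each loop iteration is one timed run
    {
        std::vector<int> dst(src);
        benchmark::DoNotOptimize(dst.data());
    }
}
// Each Args tuple becomes one benchmark instance, read back via state.range(i).
BENCHMARK_REGISTER_F(CopyBench, vector_copy)
    ->Args({1024})
    ->Args({1024 * 1024});

BENCHMARK_MAIN();
```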
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ +namespace tests +{ + + +using Packet = mpp::MPPDataPacket; +using PacketPtr = std::shared_ptr; +using PacketQueue = MPMCQueue; +using PacketQueuePtr = std::shared_ptr; +using StopFlag = std::atomic; + +// NOLINTBEGIN(readability-convert-member-functions-to-static) +struct MockReceiverContext +{ + using Status = ::grpc::Status; + struct Request + { + String debugString() const + { + return "{Request}"; + } + + int source_index = 0; + int send_task_id = 0; + int recv_task_id = -1; + }; + + struct Reader + { + explicit Reader(const PacketQueuePtr & queue_) + : queue(queue_) + {} + + void initialize() const + { + } + + bool read(PacketPtr & packet [[maybe_unused]]) const + { + PacketPtr res; + if (queue->pop(res)) + { + *packet = *res; // avoid change shared packets + return true; + } + return false; + } + + Status finish() const + { + return ::grpc::Status(); + } + + PacketQueuePtr queue; + }; + + struct MockAsyncGrpcExchangePacketReader + { + // Not implement benchmark for Async GRPC for now. + void init(UnaryCallback *) { assert(0); } + void read(MPPDataPacketPtr &, UnaryCallback *) { assert(0); } + void finish(::grpc::Status &, UnaryCallback *) { assert(0); } + }; + + using AsyncReader = MockAsyncGrpcExchangePacketReader; + + MockReceiverContext( + const std::vector & queues_, + const std::vector & field_types_) + : queues(queues_) + , field_types(field_types_) + { + } + + void fillSchema(DAGSchema & schema) const + { + schema.clear(); + for (size_t i = 0; i < field_types.size(); ++i) + { + String name = "exchange_receiver_" + std::to_string(i); + ColumnInfo info = TiDB::fieldTypeToColumnInfo(field_types[i]); + schema.emplace_back(std::move(name), std::move(info)); + } + } + + Request makeRequest(int index) const + { + return {index, index, -1}; + } + + std::shared_ptr makeReader(const Request & request) + { + return std::make_shared(queues[request.send_task_id]); + } + + static Status getStatusOK() + { + return ::grpc::Status(); + } + + bool supportAsync(const Request &) const { return false; } + void makeAsyncReader( + const Request &, + std::shared_ptr &, + UnaryCallback *) const {} + + std::vector queues; + std::vector field_types; +}; +// NOLINTEND(readability-convert-member-functions-to-static) + +using MockExchangeReceiver = ExchangeReceiverBase; +using MockExchangeReceiverPtr = std::shared_ptr; +using MockExchangeReceiverInputStream = TiRemoteBlockInputStream; + +struct MockWriter : public PacketWriter +{ + explicit MockWriter(PacketQueuePtr queue_) + : queue(std::move(queue_)) + {} + + bool write(const Packet & packet) override + { + queue->push(std::make_shared(packet)); + return true; + } + + void finish() + { + queue->finish(); + } + + PacketQueuePtr queue; +}; + +using MockWriterPtr = std::shared_ptr; +using MockTunnel = MPPTunnelBase; +using MockTunnelPtr = std::shared_ptr; +using MockTunnelSet = MPPTunnelSetBase; +using MockTunnelSetPtr = std::shared_ptr; + +struct MockBlockInputStream : public IProfilingBlockInputStream +{ + const std::vector & blocks; + Block header; + std::mt19937 mt; + std::uniform_int_distribution dist; + StopFlag & stop_flag; + + MockBlockInputStream(const std::vector & blocks_, StopFlag & stop_flag_); + + String getName() const override { return "MockBlockInputStream"; } + Block getHeader() const override { return header; } + 
+ Block readImpl() override + { + if (stop_flag.load(std::memory_order_relaxed)) + return Block{}; + return blocks[dist(mt)]; + } +}; + +// Similar to MockBlockInputStream, but return fixed count of rows. +struct MockFixedRowsBlockInputStream : public IProfilingBlockInputStream +{ + Block header; + std::mt19937 mt; + std::uniform_int_distribution dist; + size_t current_rows; + size_t total_rows; + const std::vector & blocks; + + MockFixedRowsBlockInputStream(size_t total_rows_, const std::vector & blocks_); + + String getName() const override { return "MockBlockInputStream"; } + Block getHeader() const override { return header; } + + Block readImpl() override + { + if (current_rows >= total_rows) + return Block{}; + Block res = blocks[dist(mt)]; + current_rows += res.rows(); + return res; + } +}; + +Block makeBlock(int row_num, bool skew = false); +std::vector makeBlocks(int block_num, int row_num, bool skew = false); +mpp::MPPDataPacket makePacket(ChunkCodecStream & codec, int row_num); +std::vector makePackets(ChunkCodecStream & codec, int packet_num, int row_num); +std::vector makePacketQueues(int source_num, int queue_size); +std::vector makeFields(); +void printException(const Exception & e); +void sendPacket(const std::vector & packets, const PacketQueuePtr & queue, StopFlag & stop_flag); +void receivePacket(const PacketQueuePtr & queue); + +struct ReceiverHelper +{ + const int concurrency; + const int source_num; + const uint32_t fine_grained_shuffle_stream_count; + tipb::ExchangeReceiver pb_exchange_receiver; + std::vector fields; + mpp::TaskMeta task_meta; + std::vector queues; + std::shared_ptr join_ptr; + + explicit ReceiverHelper(int concurrency_, int source_num_, uint32_t fine_grained_shuffle_stream_count_); + MockExchangeReceiverPtr buildReceiver(); + std::vector buildExchangeReceiverStream(); + BlockInputStreamPtr buildUnionStream(); + void finish(); +}; + +struct SenderHelper +{ + const int source_num; + const int concurrency; + const uint32_t fine_grained_shuffle_stream_count; + const int64_t fine_grained_shuffle_batch_size; + + std::vector queues; + std::vector mock_writers; + std::vector tunnels; + MockTunnelSetPtr tunnel_set; + std::unique_ptr dag_context; + + SenderHelper( + int source_num_, + int concurrency_, + uint32_t fine_grained_shuffle_stream_count_, + int64_t fine_grained_shuffle_batch_size_, + const std::vector & queues_, + const std::vector & fields); + + // Using MockBlockInputStream to build streams. + BlockInputStreamPtr buildUnionStream(StopFlag & stop_flag, const std::vector & blocks); + // Using MockFixedRowsBlockInputStream to build streams. + BlockInputStreamPtr buildUnionStream(size_t total_rows, const std::vector & blocks); + + void finish(); +}; + +class ExchangeBench : public benchmark::Fixture +{ +public: + void SetUp(const benchmark::State &) override; + void TearDown(const benchmark::State &) override; + void runAndWait(std::shared_ptr receiver_helper, + BlockInputStreamPtr receiver_stream, + std::shared_ptr & sender_helper, + BlockInputStreamPtr sender_stream); + + std::vector uniform_blocks; + std::vector skew_blocks; +}; + + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/tests/bench_window.cpp b/dbms/src/Flash/tests/bench_window.cpp new file mode 100644 index 00000000000..75dc53b065b --- /dev/null +++ b/dbms/src/Flash/tests/bench_window.cpp @@ -0,0 +1,167 @@ +// Copyright 2022 PingCAP, Ltd. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include
#include

namespace DB
{
namespace tests
{
class WindowFunctionBench : public ExchangeBench
{
public:
    void SetUp(const benchmark::State & state) override
    {
        // Using DAGRequestBuilder to build tipb::Window and tipb::Sort.
        // select row_number() over w1 from t1 window w1 as (partition by c1, c2, c3 order by c1, c2, c3);
        ExchangeBench::SetUp(state);
    }

    static void setupPB(uint64_t fine_grained_shuffle_stream_count, tipb::Window & window, tipb::Sort & sort)
    {
        MockColumnInfoVec columns{
            {"c1", TiDB::TP::TypeLongLong},
            {"c2", TiDB::TP::TypeString},
            {"c3", TiDB::TP::TypeLongLong},
        };
        size_t executor_index = 0;
        DAGRequestBuilder builder(executor_index);
        builder
            .mockTable("test", "t1", columns)
            .sort({{"c1", false}, {"c2", false}, {"c3", false}}, true, fine_grained_shuffle_stream_count)
            .window(RowNumber(),
                    {{"c1", false}, {"c2", false}, {"c3", false}},
                    {{"c1", false}, {"c2", false}, {"c3", false}},
                    buildDefaultRowsFrame(),
                    fine_grained_shuffle_stream_count);
        tipb::DAGRequest req;
        MPPInfo mpp_info(0, -1, -1, {}, std::unordered_map<String, std::vector<Int64>>{});
        builder.getRoot()->toTiPBExecutor(req.mutable_root_executor(), /*collator_id=*/0, mpp_info, TiFlashTestEnv::getContext());
        assert(req.root_executor().tp() == tipb::TypeWindow);
        window = req.root_executor().window();
        assert(window.child().tp() == tipb::TypeSort);
        sort = window.child().sort();
    }

    static void prepareWindowStream(Context & context, int concurrency, int source_num, int total_rows, uint32_t fine_grained_shuffle_stream_count, uint64_t fine_grained_shuffle_batch_size, const std::vector<Block> & blocks, BlockInputStreamPtr & sender_stream, BlockInputStreamPtr & receiver_stream, std::shared_ptr<SenderHelper> & sender_helper, std::shared_ptr<ReceiverHelper> & receiver_helper, bool build_window = true)
    {
        tipb::Window window;
        tipb::Sort sort;
        setupPB(fine_grained_shuffle_stream_count, window, sort);

        DAGPipeline pipeline;
        receiver_helper = std::make_shared<ReceiverHelper>(concurrency, source_num, fine_grained_shuffle_stream_count);
        pipeline.streams = receiver_helper->buildExchangeReceiverStream();

        sender_helper = std::make_shared<SenderHelper>(source_num, concurrency, fine_grained_shuffle_stream_count, fine_grained_shuffle_batch_size, receiver_helper->queues, receiver_helper->fields);
        sender_stream = sender_helper->buildUnionStream(total_rows, blocks);

        context.setDAGContext(sender_helper->dag_context.get());
        std::vector<NameAndTypePair> source_columns{
            NameAndTypePair("c1", makeNullable(std::make_shared<DataTypeInt64>())),
            NameAndTypePair("c2", makeNullable(std::make_shared<DataTypeString>())),
            NameAndTypePair("c3", makeNullable(std::make_shared<DataTypeInt64>()))};
        auto mock_interpreter = mockInterpreter(context, source_columns, concurrency);
        mock_interpreter->input_streams_vec.push_back(pipeline.streams);
        mockExecuteWindowOrder(mock_interpreter, pipeline, sort, fine_grained_shuffle_stream_count);
        if (build_window)
        {
            mockExecuteWindow(mock_interpreter, pipeline, window,
                              fine_grained_shuffle_stream_count);
        }
        pipeline.transform([&](auto & stream) {
            stream = std::make_shared<SquashingBlockInputStream>(stream, 8192, 0, "mock_executor_id_squashing");
        });
        receiver_stream = std::make_shared<UnionBlockInputStream<>>(pipeline.streams, BlockInputStreams{}, concurrency, /*req_id=*/"");
    }
};

BENCHMARK_DEFINE_F(WindowFunctionBench, basic_row_number)
(benchmark::State & state)
try
{
    const int concurrency = state.range(0);
    const int source_num = state.range(1);
    const int total_rows = state.range(2);
    const int fine_grained_shuffle_stream_count = state.range(3);
    const int fine_grained_shuffle_batch_size = state.range(4);
    const bool skew = state.range(5);
    Context context = TiFlashTestEnv::getContext();

    std::vector<Block> * blocks = &uniform_blocks;
    if (skew)
        blocks = &skew_blocks;

    for (auto _ : state)
    {
        std::shared_ptr<SenderHelper> sender_helper;
        std::shared_ptr<ReceiverHelper> receiver_helper;
        BlockInputStreamPtr sender_stream;
        BlockInputStreamPtr receiver_stream;

        prepareWindowStream(context, concurrency, source_num, total_rows, fine_grained_shuffle_stream_count, fine_grained_shuffle_batch_size, *blocks, sender_stream, receiver_stream, sender_helper, receiver_helper);

        runAndWait(receiver_helper, receiver_stream, sender_helper, sender_stream);
    }
}
CATCH
BENCHMARK_REGISTER_F(WindowFunctionBench, basic_row_number)
    ->Args({8, 1, 1024 * 1000, 0, 4096, false}) // Test fine_grained_shuffle_stream_count.
    ->Args({8, 1, 1024 * 1000, 4, 4096, false})
    ->Args({8, 1, 1024 * 1000, 8, 4096, false})
    ->Args({8, 1, 1024 * 1000, 16, 4096, false})
    ->Args({8, 1, 1024 * 1000, 32, 4096, false})
    ->Args({8, 1, 1024 * 1000, 8, 1, false}) // Test fine_grained_shuffle_batch_size.
    ->Args({8, 1, 1024 * 1000, 8, 1000, false})
    ->Args({8, 1, 1024 * 1000, 8, 10000, false})
    ->Args({8, 1, 1024 * 1000, 8, 100000, false})
    ->Args({8, 1, 1024 * 1000, 0, 4096, true}) // Test skew dataset.
    ->Args({8, 1, 1024 * 1000, 4, 4096, true})
    ->Args({8, 1, 1024 * 1000, 8, 4096, true})
    ->Args({8, 1, 1024 * 1000, 16, 4096, true});

BENCHMARK_DEFINE_F(WindowFunctionBench, partial_sort_skew_dataset)
(benchmark::State & state)
try
{
    const int concurrency = state.range(0);
    const int source_num = state.range(1);
    const int total_rows = state.range(2);
    const int fine_grained_shuffle_stream_count = state.range(3);
    const int fine_grained_shuffle_batch_size = state.range(4);
    Context context = TiFlashTestEnv::getContext();

    std::vector<Block> * blocks = &skew_blocks;

    for (auto _ : state)
    {
        std::shared_ptr<SenderHelper> sender_helper;
        std::shared_ptr<ReceiverHelper> receiver_helper;
        BlockInputStreamPtr sender_stream;
        BlockInputStreamPtr receiver_stream;

        // Only build partial sort.
        prepareWindowStream(context, concurrency, source_num, total_rows, fine_grained_shuffle_stream_count, fine_grained_shuffle_batch_size, *blocks, sender_stream, receiver_stream, sender_helper, receiver_helper, /*build_window=*/false);

        runAndWait(receiver_helper, receiver_stream, sender_helper, sender_stream);
    }
}
CATCH
BENCHMARK_REGISTER_F(WindowFunctionBench, partial_sort_skew_dataset)
    ->Args({1, 1, 1024 * 10000, 0, 4096}) // Test how much multi-threading improves partial sort performance.
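    // Args layout (matching state.range(0..4) above): {concurrency, source_num,
    // total_rows, fine_grained_shuffle_stream_count, fine_grained_shuffle_batch_size};
    // only the concurrency argument varies across these registrations.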
+ ->Args({2, 1, 1024 * 10000, 0, 4096}) + ->Args({4, 1, 1024 * 10000, 0, 4096}) + ->Args({8, 1, 1024 * 10000, 0, 4096}); +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/tests/exchange_perftest.cpp b/dbms/src/Flash/tests/exchange_perftest.cpp deleted file mode 100644 index c2e047bec62..00000000000 --- a/dbms/src/Flash/tests/exchange_perftest.cpp +++ /dev/null @@ -1,699 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include // to include the implementation of StreamingDAGResponseWriter -#include // to include the implementation of ExchangeReceiver -#include // to include the implementation of MPPTunnel -#include // to include the implementation of MPPTunnelSet -#include -#include -#include - -namespace DB::tests -{ -namespace -{ -std::random_device rd; - -using Packet = mpp::MPPDataPacket; -using PacketPtr = std::shared_ptr; -using PacketQueue = MPMCQueue; -using PacketQueuePtr = std::shared_ptr; -using StopFlag = std::atomic; - -std::atomic received_data_size{0}; - -struct MockReceiverContext -{ - struct Status - { - int status_code = 0; - String error_msg; - - bool ok() const - { - return status_code == 0; - } - - const String & error_message() const - { - return error_msg; - } - - int error_code() const - { - return status_code; - } - }; - - struct Request - { - String debugString() const - { - return "{Request}"; - } - - int source_index = 0; - int send_task_id = 0; - int recv_task_id = -1; - }; - - struct Reader - { - explicit Reader(const PacketQueuePtr & queue_) - : queue(queue_) - {} - - void initialize() const - { - } - - bool read(PacketPtr & packet [[maybe_unused]]) const - { - PacketPtr res; - if (queue->pop(res)) - { - received_data_size.fetch_add(res->ByteSizeLong()); - *packet = *res; // avoid change shared packets - return true; - } - return false; - } - - Status finish() const - { - return {0, ""}; - } - - PacketQueuePtr queue; - }; - - MockReceiverContext( - const std::vector & queues_, - const std::vector & field_types_) - : queues(queues_) - , field_types(field_types_) - { - } - - void fillSchema(DAGSchema & schema) const - { - schema.clear(); - for (size_t i = 0; i < field_types.size(); ++i) - { - String name = "exchange_receiver_" + std::to_string(i); - ColumnInfo info = TiDB::fieldTypeToColumnInfo(field_types[i]); - schema.emplace_back(std::move(name), std::move(info)); - } - } - - Request makeRequest(int index) const - { - return {index, index, -1}; - } - - std::shared_ptr makeReader(const Request & request) - { - return std::make_shared(queues[request.send_task_id]); - } - - static Status getStatusOK() - { - return {0, ""}; - } - - std::vector queues; - std::vector field_types; -}; - -using MockExchangeReceiver = ExchangeReceiverBase; -using MockExchangeReceiverPtr = std::shared_ptr; -using MockExchangeReceiverInputStream = TiRemoteBlockInputStream; - -struct MockWriter -{ - explicit 
MockWriter(PacketQueuePtr queue_) - : queue(std::move(queue_)) - {} - - bool Write(const Packet & packet) - { - queue->push(std::make_shared(packet)); - return true; - } - - void finish() - { - queue->finish(); - } - - PacketQueuePtr queue; -}; - -using MockWriterPtr = std::shared_ptr; -using MockTunnel = MPPTunnelBase; -using MockTunnelPtr = std::shared_ptr; -using MockTunnelSet = MPPTunnelSetBase; -using MockTunnelSetPtr = std::shared_ptr; - -struct MockBlockInputStream : public IProfilingBlockInputStream -{ - const std::vector & blocks; - Block header; - std::mt19937 mt; - std::uniform_int_distribution dist; - StopFlag & stop_flag; - - MockBlockInputStream(const std::vector & blocks_, StopFlag & stop_flag_) - : blocks(blocks_) - , header(blocks[0].cloneEmpty()) - , mt(rd()) - , dist(0, blocks.size() - 1) - , stop_flag(stop_flag_) - {} - - String getName() const override { return "MockBlockInputStream"; } - Block getHeader() const override { return header; } - - Block readImpl() override - { - if (stop_flag.load(std::memory_order_relaxed)) - return Block{}; - return blocks[dist(mt)]; - } -}; - -Block makeBlock(int row_num) -{ - std::mt19937 mt(rd()); - std::uniform_int_distribution int64_dist; - std::uniform_int_distribution len_dist(10, 20); - std::uniform_int_distribution char_dist; - - InferredDataVector> int64_vec; - InferredDataVector> int64_vec2; - for (int i = 0; i < row_num; ++i) - { - int64_vec.emplace_back(int64_dist(mt)); - int64_vec2.emplace_back(int64_dist(mt)); - } - - InferredDataVector> string_vec; - for (int i = 0; i < row_num; ++i) - { - int len = len_dist(mt); - String s; - for (int j = 0; j < len; ++j) - s.push_back(char_dist(mt)); - string_vec.push_back(std::move(s)); - } - - auto int64_data_type = makeDataType>(); - ColumnWithTypeAndName int64_column(makeColumn>(int64_data_type, int64_vec), int64_data_type, "int64_1"); - ColumnWithTypeAndName int64_column2(makeColumn>(int64_data_type, int64_vec2), int64_data_type, "int64_2"); - - auto string_data_type = makeDataType>(); - ColumnWithTypeAndName string_column(makeColumn>(string_data_type, string_vec), string_data_type, "string"); - - return Block({int64_column, string_column, int64_column2}); -} - -std::vector makeBlocks(int block_num, int row_num) -{ - std::vector blocks; - for (int i = 0; i < block_num; ++i) - blocks.push_back(makeBlock(row_num)); - return blocks; -} - -mpp::MPPDataPacket makePacket(ChunkCodecStream & codec, int row_num) -{ - auto block = makeBlock(row_num); - codec.encode(block, 0, row_num); - - mpp::MPPDataPacket packet; - packet.add_chunks(codec.getString()); - codec.clear(); - - return packet; -} - -std::vector makePackets(ChunkCodecStream & codec, int packet_num, int row_num) -{ - std::vector packets; - for (int i = 0; i < packet_num; ++i) - packets.push_back(std::make_shared(makePacket(codec, row_num))); - return packets; -} - -std::vector makePacketQueues(int source_num, int queue_size) -{ - std::vector queues; - for (int i = 0; i < source_num; ++i) - queues.push_back(std::make_shared(queue_size)); - return queues; -} - -std::vector makeFields() -{ - std::vector fields(3); - fields[0].set_tp(TiDB::TypeLongLong); - fields[1].set_tp(TiDB::TypeString); - fields[2].set_tp(TiDB::TypeLongLong); - return fields; -} - -void printException(const Exception & e) -{ - std::string text = e.displayText(); - - auto embedded_stack_trace_pos = text.find("Stack trace"); - std::cerr << "Code: " << e.code() << ". 
" << text << std::endl - << std::endl; - if (std::string::npos == embedded_stack_trace_pos) - std::cerr << "Stack trace:" << std::endl - << e.getStackTrace().toString() << std::endl; -} - -void sendPacket(const std::vector & packets, const PacketQueuePtr & queue, StopFlag & stop_flag) -{ - std::mt19937 mt(rd()); - std::uniform_int_distribution dist(0, packets.size() - 1); - - while (!stop_flag.load()) - { - int i = dist(mt); - queue->tryPush(packets[i], std::chrono::milliseconds(10)); - } - queue->finish(); -} - -void receivePacket(const PacketQueuePtr & queue) -{ - while (true) - { - PacketPtr packet; - if (queue->pop(packet)) - received_data_size.fetch_add(packet->ByteSizeLong()); - else - break; - } -} - -template -void readBlock(BlockInputStreamPtr stream) -{ - [[maybe_unused]] auto get_rate = [](auto count, auto duration) { - return count * 1000 / duration.count(); - }; - - [[maybe_unused]] auto get_mib = [](auto v) { - return v / 1024 / 1024; - }; - - [[maybe_unused]] auto start = std::chrono::high_resolution_clock::now(); - [[maybe_unused]] auto second_ago = start; - [[maybe_unused]] Int64 block_count = 0; - [[maybe_unused]] Int64 last_block_count = 0; - [[maybe_unused]] Int64 last_data_size = received_data_size.load(); - try - { - stream->readPrefix(); - while (auto block = stream->read()) - { - if constexpr (print_progress) - { - ++block_count; - auto cur = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(cur - second_ago); - if (duration.count() >= 1000) - { - Int64 data_size = received_data_size.load(); - std::cout - << fmt::format( - "Blocks: {:<10} Data(MiB): {:<8} Block/s: {:<6} Data/s(MiB): {:<6}", - block_count, - get_mib(data_size), - get_rate(block_count - last_block_count, duration), - get_mib(get_rate(data_size - last_data_size, duration))) - << std::endl; - second_ago = cur; - last_block_count = block_count; - last_data_size = data_size; - } - } - } - stream->readSuffix(); - - if constexpr (print_progress) - { - auto cur = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(cur - start); - Int64 data_size = received_data_size.load(); - std::cout - << fmt::format( - "End. 
Blocks: {:<10} Data(MiB): {:<8} Block/s: {:<6} Data/s(MiB): {:<6}", - block_count, - get_mib(data_size), - get_rate(block_count, duration), - get_mib(get_rate(data_size, duration))) - << std::endl; - } - } - catch (const Exception & e) - { - printException(e); - throw; - } -} - -struct ReceiverHelper -{ - const int source_num; - tipb::ExchangeReceiver pb_exchange_receiver; - std::vector fields; - mpp::TaskMeta task_meta; - std::vector queues; - std::shared_ptr join_ptr; - - explicit ReceiverHelper(int source_num_) - : source_num(source_num_) - { - pb_exchange_receiver.set_tp(tipb::Hash); - for (int i = 0; i < source_num; ++i) - { - mpp::TaskMeta task; - task.set_start_ts(0); - task.set_task_id(i); - task.set_partition_id(i); - task.set_address(""); - - String encoded_task; - task.SerializeToString(&encoded_task); - - pb_exchange_receiver.add_encoded_task_meta(encoded_task); - } - - fields = makeFields(); - *pb_exchange_receiver.add_field_types() = fields[0]; - *pb_exchange_receiver.add_field_types() = fields[1]; - *pb_exchange_receiver.add_field_types() = fields[2]; - - task_meta.set_task_id(100); - - queues = makePacketQueues(source_num, 10); - } - - MockExchangeReceiverPtr buildReceiver() - { - return std::make_shared( - std::make_shared(queues, fields), - source_num, - source_num * 5, - nullptr); - } - - BlockInputStreamPtr buildUnionStream(int concurrency) - { - auto receiver = buildReceiver(); - std::vector streams; - for (int i = 0; i < concurrency; ++i) - streams.push_back(std::make_shared(receiver, nullptr)); - return std::make_shared>(streams, nullptr, concurrency, /*req_id=*/""); - } - - BlockInputStreamPtr buildUnionStreamWithHashJoinBuildStream(int concurrency) - { - auto receiver = buildReceiver(); - std::vector streams; - for (int i = 0; i < concurrency; ++i) - streams.push_back(std::make_shared(receiver, nullptr)); - - auto receiver_header = streams.front()->getHeader(); - auto key_name = receiver_header.getByPosition(0).name; - - join_ptr = std::make_shared( - Names{key_name}, - Names{key_name}, - true, - SizeLimits(0, 0, OverflowMode::THROW), - ASTTableJoin::Kind::Inner, - ASTTableJoin::Strictness::All, - /*req_id=*/"", - TiDB::TiDBCollators{nullptr}, - "", - "", - "", - "", - nullptr, - 65536); - - join_ptr->init(receiver_header, concurrency); - - for (int i = 0; i < concurrency; ++i) - streams[i] = std::make_shared(streams[i], join_ptr, i, /*req_id=*/""); - - return std::make_shared>(streams, nullptr, concurrency, /*req_id=*/""); - } - - void finish() - { - if (join_ptr) - { - join_ptr->setBuildTableState(Join::BuildTableState::SUCCEED); - std::cout << fmt::format("Hash table size: {} bytes", join_ptr->getTotalByteCount()) << std::endl; - } - } -}; - -struct SenderHelper -{ - const int source_num; - const int concurrency; - - std::vector queues; - std::vector mock_writers; - std::vector tunnels; - MockTunnelSetPtr tunnel_set; - std::unique_ptr dag_context; - - SenderHelper( - int source_num_, - int concurrency_, - const std::vector & queues_, - const std::vector & fields) - : source_num(source_num_) - , concurrency(concurrency_) - , queues(queues_) - { - mpp::TaskMeta task_meta; - tunnel_set = std::make_shared(); - for (int i = 0; i < source_num; ++i) - { - auto writer = std::make_shared(queues[i]); - mock_writers.push_back(writer); - - auto tunnel = std::make_shared( - task_meta, - task_meta, - std::chrono::seconds(60), - concurrency, - false); - tunnel->connect(writer.get()); - tunnels.push_back(tunnel); - tunnel_set->addTunnel(tunnel); - } - - tipb::DAGRequest 
dag_request; - tipb::Executor root_executor; - root_executor.set_executor_id("ExchangeSender_100"); - *dag_request.mutable_root_executor() = root_executor; - - dag_context = std::make_unique(dag_request); - dag_context->is_mpp_task = true; - dag_context->is_root_mpp_task = false; - dag_context->encode_type = tipb::EncodeType::TypeCHBlock; - dag_context->result_field_types = fields; - } - - BlockInputStreamPtr buildUnionStream( - StopFlag & stop_flag, - const std::vector & blocks) - { - std::vector send_streams; - for (int i = 0; i < concurrency; ++i) - { - BlockInputStreamPtr stream = std::make_shared(blocks, stop_flag); - std::unique_ptr response_writer( - new StreamingDAGResponseWriter( - tunnel_set, - {0, 1, 2}, - TiDB::TiDBCollators(3), - tipb::Hash, - -1, - -1, - true, - *dag_context)); - send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); - } - - return std::make_shared>(send_streams, nullptr, concurrency, /*req_id=*/""); - } - - void finish() - { - for (size_t i = 0; i < tunnels.size(); ++i) - { - tunnels[i]->writeDone(); - tunnels[i]->waitForFinish(); - mock_writers[i]->finish(); - } - } -}; - -void testOnlyReceiver(int concurrency, int source_num, int block_rows, int seconds) -{ - ReceiverHelper receiver_helper(source_num); - auto union_input_stream = receiver_helper.buildUnionStream(concurrency); - - auto chunk_codec_stream = CHBlockChunkCodec().newCodecStream(receiver_helper.fields); - auto packets = makePackets(*chunk_codec_stream, 100, block_rows); - - StopFlag stop_flag(false); - - std::vector threads; - for (const auto & queue : receiver_helper.queues) - threads.emplace_back(sendPacket, std::cref(packets), queue, std::ref(stop_flag)); - threads.emplace_back(readBlock, union_input_stream); - - std::this_thread::sleep_for(std::chrono::seconds(seconds)); - stop_flag.store(true); - for (auto & thread : threads) - thread.join(); - - receiver_helper.finish(); -} - -template -void testSenderReceiver(int concurrency, int source_num, int block_rows, int seconds) -{ - ReceiverHelper receiver_helper(source_num); - BlockInputStreamPtr union_receive_stream; - if constexpr (with_join) - union_receive_stream = receiver_helper.buildUnionStreamWithHashJoinBuildStream(concurrency); - else - union_receive_stream = receiver_helper.buildUnionStream(concurrency); - - StopFlag stop_flag(false); - auto blocks = makeBlocks(100, block_rows); - - SenderHelper sender_helper(source_num, concurrency, receiver_helper.queues, receiver_helper.fields); - auto union_send_stream = sender_helper.buildUnionStream(stop_flag, blocks); - - auto write_thread = std::thread(readBlock, union_send_stream); - auto read_thread = std::thread(readBlock, union_receive_stream); - - std::this_thread::sleep_for(std::chrono::seconds(seconds)); - stop_flag.store(true); - - write_thread.join(); - sender_helper.finish(); - - read_thread.join(); - receiver_helper.finish(); -} - -void testOnlySender(int concurrency, int source_num, int block_rows, int seconds) -{ - auto queues = makePacketQueues(source_num, 10); - auto fields = makeFields(); - - StopFlag stop_flag(false); - auto blocks = makeBlocks(100, block_rows); - - SenderHelper sender_helper(source_num, concurrency, queues, fields); - auto union_send_stream = sender_helper.buildUnionStream(stop_flag, blocks); - - auto write_thread = std::thread(readBlock, union_send_stream); - std::vector read_threads; - for (int i = 0; i < source_num; ++i) - read_threads.emplace_back(receivePacket, queues[i]); - - 
std::this_thread::sleep_for(std::chrono::seconds(seconds)); - stop_flag.store(true); - - write_thread.join(); - sender_helper.finish(); - - for (auto & t : read_threads) - t.join(); -} - -} // namespace -} // namespace DB::tests - -int main(int argc [[maybe_unused]], char ** argv [[maybe_unused]]) -{ - if (argc < 2 || argc > 6) - { - std::cerr << fmt::format("Usage: {} [receiver|sender|sender_receiver|sender_receiver_join] ", argv[0]) << std::endl; - exit(1); - } - - String method = argv[1]; - int concurrency = argc >= 3 ? atoi(argv[2]) : 5; - int source_num = argc >= 4 ? atoi(argv[3]) : 2; - int block_rows = argc >= 5 ? atoi(argv[4]) : 5000; - int seconds = argc >= 6 ? atoi(argv[5]) : 10; - - using TestHandler = std::function; - std::unordered_map handlers = { - {"receiver", DB::tests::testOnlyReceiver}, - {"sender", DB::tests::testOnlySender}, - {"sender_receiver", DB::tests::testSenderReceiver}, - {"sender_receiver_join", DB::tests::testSenderReceiver}, - }; - - auto it = handlers.find(method); - if (it != handlers.end()) - { - std::cout - << fmt::format( - "{}. concurrency = {}. source_num = {}. block_rows = {}. seconds = {}", - method, - concurrency, - source_num, - block_rows, - seconds) - << std::endl; - it->second(concurrency, source_num, block_rows, seconds); - } - else - { - std::cerr << "Unknown method: " << method << std::endl; - exit(1); - } -} diff --git a/dbms/src/Flash/tests/gtest_executor.cpp b/dbms/src/Flash/tests/gtest_executor.cpp index 49512b9271f..ee35af0c03d 100644 --- a/dbms/src/Flash/tests/gtest_executor.cpp +++ b/dbms/src/Flash/tests/gtest_executor.cpp @@ -59,180 +59,170 @@ class ExecutorTestRunner : public DB::tests::ExecutorTest {toVec("s", {"banana", "banana"}), toVec("join_c", {"apple", "banana"})}); } - - void executeExecutor(const std::shared_ptr & request, const ColumnsWithTypeAndName & expect_columns, size_t concurrency = 1) - { - std::vector enable_planners{"true", "false"}; - for (auto enable : enable_planners) - { - context.context.setSetting("enable_planner", enable); - executeStreams(request, expect_columns, concurrency); - } - } }; TEST_F(ExecutorTestRunner, Filter) try { - auto request = context - .scan("test_db", "test_table") - .filter(eq(col("s1"), col("s2"))) - .build(context); - { - executeExecutor(request, - {toNullableVec({"banana"}), - toNullableVec({"banana"})}); - } + wrapForDisEnablePlanner([&]() { + auto request = context + .scan("test_db", "test_table") + .filter(eq(col("s1"), col("s2"))) + .build(context); + { + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec({"banana"}), + toNullableVec({"banana"})})); + } - request = context.receive("exchange1") - .filter(eq(col("s1"), col("s2"))) - .build(context); - { - executeExecutor(request, - {toNullableVec({"banana"}), - toNullableVec({"banana"})}); - } + request = context.receive("exchange1") + .filter(eq(col("s1"), col("s2"))) + .build(context); + { + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec({"banana"}), + toNullableVec({"banana"})})); + } + }); } CATCH TEST_F(ExecutorTestRunner, JoinWithTableScan) try { - auto request = context - .scan("test_db", "l_table") - .join(context.scan("test_db", "r_table"), {col("join_c")}, ASTTableJoin::Kind::Left) - .topN("join_c", false, 2) - .build(context); - { - String expected = "topn_3 | order_by: {(<1, String>, desc: false)}, limit: 2\n" - " Join_2 | LeftOuterJoin, HashJoin. 
left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" - " table_scan_0 | {<0, String>, <1, String>}\n" - " table_scan_1 | {<0, String>, <1, String>}\n"; - ASSERT_DAGREQUEST_EQAUL(expected, request); - executeExecutor(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 2); - - executeExecutor(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 5); - - executeExecutor(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}); - } - request = context - .scan("test_db", "l_table") - .join(context.scan("test_db", "r_table"), {col("join_c")}, ASTTableJoin::Kind::Left) - .project({"s", "join_c"}) - .topN("join_c", false, 2) - .build(context); - { - String expected = "topn_4 | order_by: {(<1, String>, desc: false)}, limit: 2\n" - " project_3 | {<0, String>, <1, String>}\n" - " Join_2 | LeftOuterJoin, HashJoin. left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" - " table_scan_0 | {<0, String>, <1, String>}\n" - " table_scan_1 | {<0, String>, <1, String>}\n"; - ASSERT_DAGREQUEST_EQAUL(expected, request); - executeExecutor(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 2); - } + wrapForDisEnablePlanner([&]() { + auto request = context + .scan("test_db", "l_table") + .join(context.scan("test_db", "r_table"), {col("join_c")}, ASTTableJoin::Kind::Left) + .topN("join_c", false, 2) + .build(context); + { + String expected = "topn_3 | order_by: {(<1, String>, desc: false)}, limit: 2\n" + " Join_2 | LeftOuterJoin, HashJoin. left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" + " table_scan_0 | {<0, String>, <1, String>}\n" + " table_scan_1 | {<0, String>, <1, String>}\n"; + ASSERT_DAGREQUEST_EQAUL(expected, request); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + + ASSERT_COLUMNS_EQ_R(executeStreams(request, 5), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + } + request = context + .scan("test_db", "l_table") + .join(context.scan("test_db", "r_table"), {col("join_c")}, ASTTableJoin::Kind::Left) + .project({"s", "join_c"}) + .topN("join_c", false, 2) + .build(context); + { + String expected = "topn_4 | order_by: {(<1, String>, desc: false)}, limit: 2\n" + " project_3 | {<0, String>, <1, String>}\n" + " Join_2 | LeftOuterJoin, HashJoin. 
left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" + " table_scan_0 | {<0, String>, <1, String>}\n" + " table_scan_1 | {<0, String>, <1, String>}\n"; + ASSERT_DAGREQUEST_EQAUL(expected, request); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + } - request = context - .scan("test_db", "l_table") - .join(context.scan("test_db", "r_table_2"), {col("join_c")}, ASTTableJoin::Kind::Left) - .topN("join_c", false, 4) - .build(context); - { - String expected = "topn_3 | order_by: {(<1, String>, desc: false)}, limit: 4\n" - " Join_2 | LeftOuterJoin, HashJoin. left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" - " table_scan_0 | {<0, String>, <1, String>}\n" - " table_scan_1 | {<0, String>, <1, String>}\n"; - ASSERT_DAGREQUEST_EQAUL(expected, request); - executeExecutor(request, - {toNullableVec({"banana", "banana", "banana", "banana"}), - toNullableVec({"apple", "apple", "apple", "banana"}), - toNullableVec({"banana", "banana", "banana", {}}), - toNullableVec({"apple", "apple", "apple", {}})}, - 2); - executeExecutor(request, - {toNullableVec({"banana", "banana", "banana", "banana"}), - toNullableVec({"apple", "apple", "apple", "banana"}), - toNullableVec({"banana", "banana", "banana", {}}), - toNullableVec({"apple", "apple", "apple", {}})}, - 3); - } + request = context + .scan("test_db", "l_table") + .join(context.scan("test_db", "r_table_2"), {col("join_c")}, ASTTableJoin::Kind::Left) + .topN("join_c", false, 4) + .build(context); + { + String expected = "topn_3 | order_by: {(<1, String>, desc: false)}, limit: 4\n" + " Join_2 | LeftOuterJoin, HashJoin. left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" + " table_scan_0 | {<0, String>, <1, String>}\n" + " table_scan_1 | {<0, String>, <1, String>}\n"; + ASSERT_DAGREQUEST_EQAUL(expected, request); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana", "banana", "banana"}), + toNullableVec({"apple", "apple", "apple", "banana"}), + toNullableVec({"banana", "banana", "banana", {}}), + toNullableVec({"apple", "apple", "apple", {}})})); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 3), + createColumns({toNullableVec({"banana", "banana", "banana", "banana"}), + toNullableVec({"apple", "apple", "apple", "banana"}), + toNullableVec({"banana", "banana", "banana", {}}), + toNullableVec({"apple", "apple", "apple", {}})})); + } + }); } CATCH TEST_F(ExecutorTestRunner, JoinWithExchangeReceiver) try { - auto request = context - .receive("exchange_l_table") - .join(context.receive("exchange_r_table"), {col("join_c")}, ASTTableJoin::Kind::Left) - .topN("join_c", false, 2) - .build(context); - { - String expected = "topn_3 | order_by: {(<1, String>, desc: false)}, limit: 2\n" - " Join_2 | LeftOuterJoin, HashJoin. 
left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" - " exchange_receiver_0 | type:PassThrough, {<0, String>, <1, String>}\n" - " exchange_receiver_1 | type:PassThrough, {<0, String>, <1, String>}\n"; - ASSERT_DAGREQUEST_EQAUL(expected, request); - executeExecutor(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 2); - - executeExecutor(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 5); - - executeExecutor(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}); - } + wrapForDisEnablePlanner([&]() { + auto request = context + .receive("exchange_l_table") + .join(context.receive("exchange_r_table"), {col("join_c")}, ASTTableJoin::Kind::Left) + .topN("join_c", false, 2) + .build(context); + { + String expected = "topn_3 | order_by: {(<1, String>, desc: false)}, limit: 2\n" + " Join_2 | LeftOuterJoin, HashJoin. left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" + " exchange_receiver_0 | type:PassThrough, {<0, String>, <1, String>}\n" + " exchange_receiver_1 | type:PassThrough, {<0, String>, <1, String>}\n"; + ASSERT_DAGREQUEST_EQAUL(expected, request); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + + ASSERT_COLUMNS_EQ_R(executeStreams(request, 5), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + } + }); } CATCH TEST_F(ExecutorTestRunner, JoinWithTableScanAndReceiver) try { - auto request = context - .scan("test_db", "l_table") - .join(context.receive("exchange_r_table"), {col("join_c")}, ASTTableJoin::Kind::Left) - .topN("join_c", false, 2) - .build(context); - { - String expected = "topn_3 | order_by: {(<1, String>, desc: false)}, limit: 2\n" - " Join_2 | LeftOuterJoin, HashJoin. left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" - " table_scan_0 | {<0, String>, <1, String>}\n" - " exchange_receiver_1 | type:PassThrough, {<0, String>, <1, String>}\n"; - ASSERT_DAGREQUEST_EQAUL(expected, request); - executeExecutor(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 2); - } + wrapForDisEnablePlanner([&]() { + auto request = context + .scan("test_db", "l_table") + .join(context.receive("exchange_r_table"), {col("join_c")}, ASTTableJoin::Kind::Left) + .topN("join_c", false, 2) + .build(context); + { + String expected = "topn_3 | order_by: {(<1, String>, desc: false)}, limit: 2\n" + " Join_2 | LeftOuterJoin, HashJoin. 
left_join_keys: {<0, String>}, right_join_keys: {<0, String>}\n" + " table_scan_0 | {<0, String>, <1, String>}\n" + " exchange_receiver_1 | type:PassThrough, {<0, String>, <1, String>}\n"; + ASSERT_DAGREQUEST_EQAUL(expected, request); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + } + }); } CATCH diff --git a/dbms/src/Flash/tests/gtest_qb_interpreter.cpp b/dbms/src/Flash/tests/gtest_interpreter.cpp similarity index 56% rename from dbms/src/Flash/tests/gtest_qb_interpreter.cpp rename to dbms/src/Flash/tests/gtest_interpreter.cpp index 9c4c15857d3..d681cfaf66b 100644 --- a/dbms/src/Flash/tests/gtest_qb_interpreter.cpp +++ b/dbms/src/Flash/tests/gtest_interpreter.cpp @@ -19,26 +19,26 @@ namespace DB { namespace tests { -class QBInterpreterExecuteTest : public DB::tests::ExecutorTest +class InterpreterExecuteTest : public DB::tests::ExecutorTest { public: void initializeContext() override { ExecutorTest::initializeContext(); - context.context.setSetting("enable_planner", "false"); + enablePlanner(false); context.addMockTable({"test_db", "test_table"}, {{"s1", TiDB::TP::TypeString}, {"s2", TiDB::TP::TypeString}}); context.addMockTable({"test_db", "test_table_1"}, {{"s1", TiDB::TP::TypeString}, {"s2", TiDB::TP::TypeString}, {"s3", TiDB::TP::TypeString}}); context.addMockTable({"test_db", "r_table"}, {{"r_a", TiDB::TP::TypeLong}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); context.addMockTable({"test_db", "l_table"}, {{"l_a", TiDB::TP::TypeLong}, {"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); context.addExchangeRelationSchema("sender_1", {{"s1", TiDB::TP::TypeString}, {"s2", TiDB::TP::TypeString}, {"s3", TiDB::TP::TypeString}}); - context.addExchangeRelationSchema("sender_l", {{"l_a", TiDB::TP::TypeString}, {"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); - context.addExchangeRelationSchema("sender_r", {{"r_a", TiDB::TP::TypeString}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); + context.addExchangeRelationSchema("sender_l", {{"l_a", TiDB::TP::TypeLong}, {"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); + context.addExchangeRelationSchema("sender_r", {{"r_a", TiDB::TP::TypeLong}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); } }; -TEST_F(QBInterpreterExecuteTest, SingleQueryBlock) +TEST_F(InterpreterExecuteTest, SingleQueryBlock) try { auto request = context.scan("test_db", "test_table_1") @@ -92,13 +92,13 @@ Union: } CATCH -TEST_F(QBInterpreterExecuteTest, MultipleQueryBlockWithSource) +TEST_F(InterpreterExecuteTest, MultipleQueryBlockWithSource) try { auto request = context.scan("test_db", "test_table_1") .project({"s1", "s2", "s3"}) .project({"s1", "s2"}) - .project("s1") + .project({"s1"}) .build(context); { String expected = R"( @@ -200,51 +200,10 @@ Union: ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } - // Join Source. 
- DAGRequestBuilder table1 = context.scan("test_db", "r_table"); - DAGRequestBuilder table2 = context.scan("test_db", "l_table"); - DAGRequestBuilder table3 = context.scan("test_db", "r_table"); - DAGRequestBuilder table4 = context.scan("test_db", "l_table"); - - request = table1.join( - table2.join( - table3.join(table4, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) - .build(context); - { - String expected = R"( -CreatingSets - Union: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - MockTableScan - Union x 2: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - Expression: - HashJoinProbe: - Expression: - MockTableScan - Union: - Expression x 10: - Expression: - HashJoinProbe: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - request = context.receive("sender_1") .project({"s1", "s2", "s3"}) .project({"s1", "s2"}) - .project("s1") + .project({"s1"}) .build(context); { String expected = R"( @@ -263,7 +222,7 @@ Union: request = context.receive("sender_1") .project({"s1", "s2", "s3"}) .project({"s1", "s2"}) - .project("s1") + .project({"s1"}) .exchangeSender(tipb::Broadcast) .build(context); { @@ -280,387 +239,394 @@ Union: MockExchangeReceiver)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } - - // only join + ExchangeReceiver - DAGRequestBuilder receiver1 = context.receive("sender_l"); - DAGRequestBuilder receiver2 = context.receive("sender_r"); - DAGRequestBuilder receiver3 = context.receive("sender_l"); - DAGRequestBuilder receiver4 = context.receive("sender_r"); - - request = receiver1.join( - receiver2.join( - receiver3.join(receiver4, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) - .build(context); - { - String expected = R"( -CreatingSets - Union: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - MockExchangeReceiver - Union x 2: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver - Union: - Expression x 10: - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - // join + receiver + sender - // TODO: Find a way to write the request easier. 
- DAGRequestBuilder receiver5 = context.receive("sender_l"); - DAGRequestBuilder receiver6 = context.receive("sender_r"); - DAGRequestBuilder receiver7 = context.receive("sender_l"); - DAGRequestBuilder receiver8 = context.receive("sender_r"); - request = receiver5.join( - receiver6.join( - receiver7.join(receiver8, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) - .exchangeSender(tipb::PassThrough) - .build(context); - { - String expected = R"( -CreatingSets - Union: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - MockExchangeReceiver - Union x 2: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver - Union: - MockExchangeSender x 10 - Expression: - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } } CATCH -TEST_F(QBInterpreterExecuteTest, ParallelQuery) +TEST_F(InterpreterExecuteTest, Window) try { - /// executor with table scan - auto request = context.scan("test_db", "test_table_1") - .limit(10) + auto request = context + .scan("test_db", "test_table") + .sort({{"s1", true}, {"s2", false}}, true) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()) .build(context); { String expected = R"( -Limit, limit = 10 - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); - - expected = R"( Union: - SharedQuery x 5: - Limit, limit = 10 - Union: - Limit x 5, limit = 10 + Expression x 10: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 5); + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } - request = context.scan("test_db", "test_table_1") - .project({"s1", "s2", "s3"}) + request = context.scan("test_db", "test_table") + .sort({{"s1", true}, {"s2", false}}, true) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()) + .project({"s1", "s2", "RowNumber()"}) .build(context); { String expected = R"( -Expression: - Expression: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); - - expected = R"( Union: - Expression x 5: + Expression x 10: Expression: Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 5); + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } request = context.scan("test_db", "test_table_1") - .aggregation({Max(col("s1"))}, {col("s2"), col("s3")}) + .sort({{"s1", true}, {"s2", false}}, true) + .project({"s1", "s2", "s3"}) + .window(RowNumber(), {"s1", true}, {"s1", false}, buildDefaultRowsFrame()) + .project({"s1", "s2", "s3", "RowNumber()"}) .build(context); { String expected = R"( -Expression: - Aggregating - Concat - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); - - expected = R"( Union: - Expression x 5: - SharedQuery: - ParallelAggregating, max_threads: 5, final: true - MockTableScan x 5)"; - 
ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 5); + Expression x 10: + Expression: + Expression: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Union: + Expression x 10: + Expression: + SharedQuery: + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } +} +CATCH - request = context.scan("test_db", "test_table_1") - .topN("s2", false, 10) - .build(context); +TEST_F(InterpreterExecuteTest, FineGrainedShuffle) +try +{ + // fine-grained shuffle is enabled. + const uint64_t enable = 8; + const uint64_t disable = 0; + auto request = context + .receive("sender_1", enable) + .sort({{"s1", true}, {"s2", false}}, true, enable) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame(), enable) + .build(context); { String expected = R"( -Expression: - MergeSorting, limit = 10 - PartialSorting: limit = 10 - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); +Union: + Expression x 10: + Expression: + Window: , function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting: , limit = 0 + PartialSorting: : limit = 0 + Expression: + MockExchangeReceiver + )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } - expected = R"( + auto topn_request = context + .receive("sender_1") + .topN("s2", false, 10) + .build(context); + String topn_expected = R"( Union: - SharedQuery x 5: + SharedQuery x 10: Expression: MergeSorting, limit = 10 Union: - PartialSorting x 5: limit = 10 - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 5); - } - - request = context.scan("test_db", "test_table_1") - .filter(eq(col("s2"), col("s3"))) - .build(context); - { - String expected = R"( -Expression: - Expression: - Filter: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); - - expected = R"( -Union: - Expression x 5: - Expression: - Filter: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 5); - } - - /// other cases - request = context.scan("test_db", "test_table_1") - .limit(10) - .project({"s1", "s2", "s3"}) - .aggregation({Max(col("s1"))}, {col("s2"), col("s3")}) + PartialSorting x 10: limit = 10 + MockExchangeReceiver + )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(topn_expected, topn_request, 10); + + // fine-grained shuffle is disabled. 
+ request = context + .receive("sender_1", disable) + .sort({{"s1", true}, {"s2", false}}, true, disable) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame(), disable) .build(context); { String expected = R"( Union: Expression x 10: SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - SharedQuery: - Limit, limit = 10 - Union: - Limit x 10, limit = 10 + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 Expression: - MockTableScan)"; + MockExchangeReceiver + )"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - - expected = R"(Expression: - Aggregating - Concat - Expression: - Limit, limit = 10 - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); } - request = context.scan("test_db", "test_table_1") - .topN("s2", false, 10) - .project({"s1", "s2", "s3"}) - .aggregation({Max(col("s1"))}, {col("s2"), col("s3")}) - .build(context); + topn_request = context + .receive("sender_1") + .topN("s2", false, 10) + .build(context); + ASSERT_BLOCKINPUTSTREAM_EQAUL(topn_expected, topn_request, 10); +} +CATCH + +TEST_F(InterpreterExecuteTest, Join) +try +{ + // TODO: Find a way to write the request easier. { - String expected = R"( -Union: - Expression x 10: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - SharedQuery: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + // Join Source. + DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + DAGRequestBuilder table3 = context.scan("test_db", "r_table"); + DAGRequestBuilder table4 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2.join( + table3.join(table4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .build(context); - expected = R"( -Expression: - Aggregating - Concat - Expression: + String expected = R"( +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: Expression: - MergeSorting, limit = 10 - PartialSorting: limit = 10 - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); + MockTableScan + Union x 2: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + Expression: + HashJoinProbe: + Expression: + MockTableScan + Union: + Expression x 10: + Expression: + HashJoinProbe: + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } - request = context.scan("test_db", "test_table_1") - .aggregation({Max(col("s1"))}, {col("s2"), col("s3")}) - .project({"s2", "s3"}) - .aggregation({Max(col("s2"))}, {col("s3")}) - .build(context); { + // only join + ExchangeReceiver + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + DAGRequestBuilder receiver3 = context.receive("sender_l"); + DAGRequestBuilder receiver4 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2.join( + receiver3.join(receiver4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .build(context); + String 
expected = R"( -Union: - Expression x 10: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockExchangeReceiver + Union x 2: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + Expression: + HashJoinProbe: + Expression: + MockExchangeReceiver + Union: + Expression x 10: + Expression: + HashJoinProbe: Expression: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - MockTableScan x 10)"; + MockExchangeReceiver)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - - expected = R"( -Expression: - Aggregating - Concat - Expression: - Expression: - Aggregating - Concat - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); } - request = context.scan("test_db", "test_table_1") - .aggregation({Max(col("s1"))}, {col("s2"), col("s3")}) - .exchangeSender(tipb::PassThrough) - .build(context); { + // join + receiver + sender + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + DAGRequestBuilder receiver3 = context.receive("sender_l"); + DAGRequestBuilder receiver4 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2.join( + receiver3.join(receiver4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .exchangeSender(tipb::PassThrough) + .build(context); + String expected = R"( -Union: - MockExchangeSender x 10 - Expression: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - MockTableScan x 10)"; +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockExchangeReceiver + Union x 2: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + Expression: + HashJoinProbe: + Expression: + MockExchangeReceiver + Union: + MockExchangeSender x 10 + Expression: + Expression: + HashJoinProbe: + Expression: + MockExchangeReceiver)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - - expected = R"( -MockExchangeSender - Expression: - Aggregating - Concat - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); } +} +CATCH - request = context.scan("test_db", "test_table_1") - .topN("s2", false, 10) - .exchangeSender(tipb::PassThrough) - .build(context); +TEST_F(InterpreterExecuteTest, JoinThenAgg) +try +{ { + // Left Join. 
+ DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2, + {col("join_c")}, + ASTTableJoin::Kind::Left) + .aggregation({Max(col("r_a"))}, {col("join_c")}) + .build(context); String expected = R"( -Union: - MockExchangeSender x 10 - SharedQuery: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - MockTableScan)"; +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockTableScan + Union: + Expression x 10: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + HashJoinProbe: + Expression: + MockTableScan)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - - expected = R"( -MockExchangeSender - Expression: - MergeSorting, limit = 10 - PartialSorting: limit = 10 - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); } - request = context.scan("test_db", "test_table_1") - .limit(10) - .exchangeSender(tipb::PassThrough) - .build(context); { + // Right Join + DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2, + {col("join_c")}, + ASTTableJoin::Kind::Right) + .aggregation({Max(col("r_a"))}, {col("join_c")}) + .build(context); String expected = R"( -Union: - MockExchangeSender x 10 - SharedQuery: - Limit, limit = 10 - Union: - Limit x 10, limit = 10 - Expression: - MockTableScan)"; +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Right + Expression: + Expression: + MockTableScan + Union: + Expression x 10: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + HashJoinProbe: + Expression: + Expression: + MockTableScan + Expression x 10: + NonJoined: )"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - - expected = R"( -MockExchangeSender - Limit, limit = 10 - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 1); } - DAGRequestBuilder table1 = context.scan("test_db", "r_table"); - DAGRequestBuilder table2 = context.scan("test_db", "l_table"); - request = table1.join(table2.limit(1), {col("join_c")}, ASTTableJoin::Kind::Left).build(context); { + // Right join + receiver + sender + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2, + {col("join_c")}, + ASTTableJoin::Kind::Right) + .aggregation({Sum(col("r_a"))}, {col("join_c")}) + .exchangeSender(tipb::PassThrough) + .limit(10) + .build(context); String expected = R"( CreatingSets Union: - HashJoinBuildBlockInputStream x 10: , join_kind = Left + HashJoinBuildBlockInputStream x 20: , join_kind = Right Expression: - SharedQuery: - Limit, limit = 1 - Union: - Limit x 10, limit = 1 - Expression: - MockTableScan + Expression: + MockExchangeReceiver Union: - Expression x 10: - Expression: - HashJoinProbe: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + MockExchangeSender x 20 + SharedQuery: + Limit, limit = 10 + Union: + Limit x 20, limit = 10 + Expression: + Expression: + SharedQuery: + ParallelAggregating, max_threads: 20, final: true + Expression x 20: + HashJoinProbe: + Expression: + Expression: + MockExchangeReceiver + Expression x 20: + NonJoined: )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 20); } } CATCH diff 
--git a/dbms/src/Flash/tests/gtest_limit_executor.cpp b/dbms/src/Flash/tests/gtest_limit_executor.cpp new file mode 100644 index 00000000000..a3ddf341525 --- /dev/null +++ b/dbms/src/Flash/tests/gtest_limit_executor.cpp @@ -0,0 +1,79 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ + +class ExecutorLimitTestRunner : public DB::tests::ExecutorTest +{ +public: + using ColDataType = std::optional::FieldType>; + using ColumnWithData = std::vector; + + void initializeContext() override + { + ExecutorTest::initializeContext(); + + context.addMockTable({db_name, table_name}, + {{col_name, TiDB::TP::TypeString}}, + {toNullableVec(col_name, col0)}); + } + + std::shared_ptr buildDAGRequest(size_t limit_num) + { + return context.scan(db_name, table_name).limit(limit_num).build(context); + } + + /// Prepare some names + const String db_name{"test_db"}; + const String table_name{"projection_test_table"}; + const String col_name{"limit_col"}; + const ColumnWithData col0{"col0-0", {}, "col0-2", "col0-3", {}, "col0-5", "col0-6", "col0-7"}; +}; + +TEST_F(ExecutorLimitTestRunner, Limit) +try +{ + wrapForDisEnablePlanner([&]() { + std::shared_ptr request; + ColumnsWithTypeAndName expect_cols; + + /// Check limit result with various parameters + const size_t col_data_num = col0.size(); + for (size_t limit_num = 0; limit_num <= col_data_num + 3; ++limit_num) + { + if (limit_num == col_data_num + 3) + limit_num = INT_MAX; + request = buildDAGRequest(limit_num); + + if (limit_num == 0) + expect_cols = {}; + else if (limit_num > col_data_num) + expect_cols = {toNullableVec(col_name, ColumnWithData(col0.begin(), col0.end()))}; + else + expect_cols = {toNullableVec(col_name, ColumnWithData(col0.begin(), col0.begin() + limit_num))}; + + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols); + } + }); +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/tests/gtest_planner_interpreter.cpp b/dbms/src/Flash/tests/gtest_planner_interpreter.cpp index 3840b858340..aa4950a2b22 100644 --- a/dbms/src/Flash/tests/gtest_planner_interpreter.cpp +++ b/dbms/src/Flash/tests/gtest_planner_interpreter.cpp @@ -26,19 +26,19 @@ class PlannerInterpreterExecuteTest : public DB::tests::ExecutorTest { ExecutorTest::initializeContext(); - context.context.setSetting("enable_planner", "true"); + enablePlanner(true); context.addMockTable({"test_db", "test_table"}, {{"s1", TiDB::TP::TypeString}, {"s2", TiDB::TP::TypeString}}); context.addMockTable({"test_db", "test_table_1"}, {{"s1", TiDB::TP::TypeString}, {"s2", TiDB::TP::TypeString}, {"s3", TiDB::TP::TypeString}}); context.addMockTable({"test_db", "r_table"}, {{"r_a", TiDB::TP::TypeLong}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); context.addMockTable({"test_db", "l_table"}, {{"l_a", TiDB::TP::TypeLong}, {"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); context.addExchangeRelationSchema("sender_1", {{"s1", 
TiDB::TP::TypeString}, {"s2", TiDB::TP::TypeString}, {"s3", TiDB::TP::TypeString}}); - context.addExchangeRelationSchema("sender_l", {{"l_a", TiDB::TP::TypeString}, {"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); - context.addExchangeRelationSchema("sender_r", {{"r_a", TiDB::TP::TypeString}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); + context.addExchangeRelationSchema("sender_l", {{"l_a", TiDB::TP::TypeLong}, {"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); + context.addExchangeRelationSchema("sender_r", {{"r_a", TiDB::TP::TypeLong}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); } }; -TEST_F(PlannerInterpreterExecuteTest, SimpleQuery) +TEST_F(PlannerInterpreterExecuteTest, SingleQueryBlock) try { auto request = context.scan("test_db", "test_table_1") @@ -92,282 +92,6 @@ Union: } CATCH -TEST_F(PlannerInterpreterExecuteTest, ComplexQuery) -try -{ - auto request = context.scan("test_db", "test_table_1") - .project({"s1", "s2", "s3"}) - .project({"s1", "s2"}) - .project("s1") - .build(context); - { - String expected = R"( -Union: - Expression x 10: - Expression: - Expression: - Expression: - Expression: - Expression: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - request = context.scan("test_db", "test_table_1") - .project({"s1", "s2", "s3"}) - .topN({{"s1", true}, {"s2", false}}, 10) - .project({"s1", "s2"}) - .build(context); - { - String expected = R"( -Union: - Expression x 10: - Expression: - Expression: - SharedQuery: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - request = context.scan("test_db", "test_table_1") - .project({"s1", "s2", "s3"}) - .topN({{"s1", true}, {"s2", false}}, 10) - .project({"s1", "s2"}) - .aggregation({Max(col("s1"))}, {col("s1"), col("s2")}) - .project({"max(s1)", "s1", "s2"}) - .build(context); - { - String expected = R"( -Union: - Expression x 10: - Expression: - Expression: - Expression: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - Expression: - SharedQuery: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - request = context.scan("test_db", "test_table_1") - .project({"s1", "s2", "s3"}) - .topN({{"s1", true}, {"s2", false}}, 10) - .project({"s1", "s2"}) - .aggregation({Max(col("s1"))}, {col("s1"), col("s2")}) - .project({"max(s1)", "s1", "s2"}) - .filter(eq(col("s1"), col("s2"))) - .project({"max(s1)", "s1"}) - .limit(10) - .build(context); - { - String expected = R"( -Union: - Expression x 10: - SharedQuery: - Limit, limit = 10 - Union: - Limit x 10, limit = 10 - Expression: - Expression: - Filter - Expression: - Expression: - Expression: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - Expression: - SharedQuery: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - // Join Source. 
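The interpreter tests in this file all follow one pattern: compose a request with `DAGRequestBuilder`, then assert the exact `BlockInputStream` tree the interpreter produces. A hedged reading of the assertion, inferred from the expected strings themselves (indentation encodes the parent/child relation, `x N` marks N parallel streams, and the trailing argument is the concurrency the interpreter is asked for):

```cpp
// Sketch only, not part of the diff: with concurrency 10, the mock scan is
// split into ten parallel Expression streams merged by the enclosing Union.
String expected = R"(
Union:
 Expression x 10:
  MockTableScan)";
ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); // 10 = requested concurrency
```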
- DAGRequestBuilder table1 = context.scan("test_db", "r_table"); - DAGRequestBuilder table2 = context.scan("test_db", "l_table"); - DAGRequestBuilder table3 = context.scan("test_db", "r_table"); - DAGRequestBuilder table4 = context.scan("test_db", "l_table"); - - request = table1.join( - table2.join( - table3.join(table4, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) - .build(context); - { - String expected = R"( -CreatingSets - Union: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - MockTableScan - Union x 2: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - Expression: - HashJoinProbe: - Expression: - MockTableScan - Union: - Expression x 10: - Expression: - HashJoinProbe: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - request = context.receive("sender_1") - .project({"s1", "s2", "s3"}) - .project({"s1", "s2"}) - .project("s1") - .build(context); - { - String expected = R"( -Union: - Expression x 10: - Expression: - Expression: - Expression: - Expression: - Expression: - Expression: - MockExchangeReceiver)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - request = context.receive("sender_1") - .project({"s1", "s2", "s3"}) - .project({"s1", "s2"}) - .project("s1") - .exchangeSender(tipb::Broadcast) - .build(context); - { - String expected = R"( -Union: - MockExchangeSender x 10 - Expression: - Expression: - Expression: - Expression: - Expression: - Expression: - Expression: - MockExchangeReceiver)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - // only join + ExchangeReceiver - DAGRequestBuilder receiver1 = context.receive("sender_l"); - DAGRequestBuilder receiver2 = context.receive("sender_r"); - DAGRequestBuilder receiver3 = context.receive("sender_l"); - DAGRequestBuilder receiver4 = context.receive("sender_r"); - - request = receiver1.join( - receiver2.join( - receiver3.join(receiver4, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) - .build(context); - { - String expected = R"( -CreatingSets - Union: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - MockExchangeReceiver - Union x 2: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver - Union: - Expression x 10: - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - // join + receiver + sender - // TODO: Find a way to write the request easier. 
- DAGRequestBuilder receiver5 = context.receive("sender_l"); - DAGRequestBuilder receiver6 = context.receive("sender_r"); - DAGRequestBuilder receiver7 = context.receive("sender_l"); - DAGRequestBuilder receiver8 = context.receive("sender_r"); - request = receiver5.join( - receiver6.join( - receiver7.join(receiver8, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) - .exchangeSender(tipb::PassThrough) - .build(context); - { - String expected = R"( -CreatingSets - Union: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - MockExchangeReceiver - Union x 2: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver - Union: - MockExchangeSender x 10 - Expression: - Expression: - HashJoinProbe: - Expression: - MockExchangeReceiver)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } -} -CATCH - TEST_F(PlannerInterpreterExecuteTest, ParallelQuery) try { @@ -673,5 +397,548 @@ CreatingSets } CATCH +TEST_F(PlannerInterpreterExecuteTest, MultipleQueryBlockWithSource) +try +{ + auto request = context.scan("test_db", "test_table_1") + .project({"s1", "s2", "s3"}) + .project({"s1", "s2"}) + .project({"s1"}) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Expression: + Expression: + Expression: + Expression: + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = context.scan("test_db", "test_table_1") + .project({"s1", "s2", "s3"}) + .topN({{"s1", true}, {"s2", false}}, 10) + .project({"s1", "s2"}) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Expression: + SharedQuery: + MergeSorting, limit = 10 + Union: + PartialSorting x 10: limit = 10 + Expression: + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = context.scan("test_db", "test_table_1") + .project({"s1", "s2", "s3"}) + .topN({{"s1", true}, {"s2", false}}, 10) + .project({"s1", "s2"}) + .aggregation({Max(col("s1"))}, {col("s1"), col("s2")}) + .project({"max(s1)", "s1", "s2"}) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Expression: + Expression: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + Expression: + SharedQuery: + MergeSorting, limit = 10 + Union: + PartialSorting x 10: limit = 10 + Expression: + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = context.scan("test_db", "test_table_1") + .project({"s1", "s2", "s3"}) + .topN({{"s1", true}, {"s2", false}}, 10) + .project({"s1", "s2"}) + .aggregation({Max(col("s1"))}, {col("s1"), col("s2")}) + .project({"max(s1)", "s1", "s2"}) + .filter(eq(col("s1"), col("s2"))) + .project({"max(s1)", "s1"}) + .limit(10) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + SharedQuery: + Limit, limit = 10 + Union: + Limit x 10, limit = 10 + Expression: + Expression: + Filter + Expression: + Expression: + Expression: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + Expression: + SharedQuery: + MergeSorting, limit = 10 + Union: + PartialSorting x 10: limit = 10 + Expression: + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = 
context.receive("sender_1") + .project({"s1", "s2", "s3"}) + .project({"s1", "s2"}) + .project({"s1"}) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Expression: + Expression: + Expression: + Expression: + Expression: + MockExchangeReceiver)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = context.receive("sender_1") + .project({"s1", "s2", "s3"}) + .project({"s1", "s2"}) + .project({"s1"}) + .exchangeSender(tipb::Broadcast) + .build(context); + { + String expected = R"( +Union: + MockExchangeSender x 10 + Expression: + Expression: + Expression: + Expression: + Expression: + Expression: + Expression: + MockExchangeReceiver)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } +} +CATCH + +TEST_F(PlannerInterpreterExecuteTest, Window) +try +{ + auto request = context + .scan("test_db", "test_table") + .sort({{"s1", true}, {"s2", false}}, true) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = context.scan("test_db", "test_table") + .sort({{"s1", true}, {"s2", false}}, true) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()) + .project({"s1", "s2", "RowNumber()"}) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Expression: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = context.scan("test_db", "test_table_1") + .sort({{"s1", true}, {"s2", false}}, true) + .project({"s1", "s2", "s3"}) + .window(RowNumber(), {"s1", true}, {"s1", false}, buildDefaultRowsFrame()) + .project({"s1", "s2", "s3", "RowNumber()"}) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Expression: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Union: + Expression x 10: + Expression: + SharedQuery: + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } +} +CATCH + +/// todo support FineGrainedShuffle +/* +TEST_F(PlannerInterpreterExecuteTest, FineGrainedShuffle) +try +{ + // fine-grained shuffle is enabled. 
+ const uint64_t enable = 8; + const uint64_t disable = 0; + auto request = context + .receive("sender_1", enable) + .sort({{"s1", true}, {"s2", false}}, true, enable) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame(), enable) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Window: , function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting: , limit = 0 + PartialSorting: : limit = 0 + Expression: + MockExchangeReceiver + )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + auto topn_request = context + .receive("sender_1") + .topN("s2", false, 10) + .build(context); + String topn_expected = R"( +Union: + SharedQuery x 10: + Expression: + MergeSorting, limit = 10 + Union: + PartialSorting x 10: limit = 10 + MockExchangeReceiver + )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(topn_expected, topn_request, 10); + + // fine-grained shuffle is disabled. + request = context + .receive("sender_1", disable) + .sort({{"s1", true}, {"s2", false}}, true, disable) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame(), disable) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockExchangeReceiver + )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + topn_request = context + .receive("sender_1") + .topN("s2", false, 10) + .build(context); + ASSERT_BLOCKINPUTSTREAM_EQAUL(topn_expected, topn_request, 10); +} +CATCH +*/ + +TEST_F(PlannerInterpreterExecuteTest, Join) +try +{ + // TODO: Find a way to write the request easier. + { + // Join Source. 
+ DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + DAGRequestBuilder table3 = context.scan("test_db", "r_table"); + DAGRequestBuilder table4 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2.join( + table3.join(table4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .build(context); + + String expected = R"( +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockTableScan + Union x 2: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + Expression: + HashJoinProbe: + Expression: + MockTableScan + Union: + Expression x 10: + Expression: + HashJoinProbe: + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + { + // only join + ExchangeReceiver + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + DAGRequestBuilder receiver3 = context.receive("sender_l"); + DAGRequestBuilder receiver4 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2.join( + receiver3.join(receiver4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .build(context); + + String expected = R"( +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockExchangeReceiver + Union x 2: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + Expression: + HashJoinProbe: + Expression: + MockExchangeReceiver + Union: + Expression x 10: + Expression: + HashJoinProbe: + Expression: + MockExchangeReceiver)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + { + // join + receiver + sender + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + DAGRequestBuilder receiver3 = context.receive("sender_l"); + DAGRequestBuilder receiver4 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2.join( + receiver3.join(receiver4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .exchangeSender(tipb::PassThrough) + .build(context); + + String expected = R"( +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockExchangeReceiver + Union x 2: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + Expression: + HashJoinProbe: + Expression: + MockExchangeReceiver + Union: + MockExchangeSender x 10 + Expression: + Expression: + HashJoinProbe: + Expression: + MockExchangeReceiver)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } +} +CATCH + +TEST_F(PlannerInterpreterExecuteTest, JoinThenAgg) +try +{ + { + // Left Join. 
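The join trees here and in `JoinThenAgg` below are easiest to read with the printed operators decoded. A hedged legend, inferred from the expected trees rather than from the interpreter sources:

```cpp
// CreatingSets                        -- runs build-side subqueries first
//   HashJoinBuildBlockInputStream x N -- N streams filling the join hash table
//   HashJoinProbe                     -- probe side streamed against the table
//   NonJoined                         -- right join only: build-side rows that
//                                        matched nothing, appended to the output
// ParallelAggregating then merges the probe streams into the final groups.
```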
+ DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2, + {col("join_c")}, + ASTTableJoin::Kind::Left) + .aggregation({Max(col("r_a"))}, {col("join_c")}) + .build(context); + String expected = R"( +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockTableScan + Union: + Expression x 10: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + HashJoinProbe: + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + { + // Right Join + DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2, + {col("join_c")}, + ASTTableJoin::Kind::Right) + .aggregation({Max(col("r_a"))}, {col("join_c")}) + .build(context); + String expected = R"( +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Right + Expression: + Expression: + MockTableScan + Union: + Expression x 10: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + HashJoinProbe: + Expression: + Expression: + MockTableScan + Expression x 10: + NonJoined: )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + { + // Right join + receiver + sender + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2, + {col("join_c")}, + ASTTableJoin::Kind::Right) + .aggregation({Sum(col("r_a"))}, {col("join_c")}) + .exchangeSender(tipb::PassThrough) + .limit(10) + .build(context); + String expected = R"( +CreatingSets + Union: + HashJoinBuildBlockInputStream x 20: , join_kind = Right + Expression: + Expression: + MockExchangeReceiver + Union: + MockExchangeSender x 20 + SharedQuery: + Limit, limit = 10 + Union: + Limit x 20, limit = 10 + Expression: + Expression: + SharedQuery: + ParallelAggregating, max_threads: 20, final: true + Expression x 20: + HashJoinProbe: + Expression: + Expression: + MockExchangeReceiver + Expression x 20: + NonJoined: )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 20); + } +} +CATCH + } // namespace tests } // namespace DB diff --git a/dbms/src/Flash/tests/gtest_projection_executor.cpp b/dbms/src/Flash/tests/gtest_projection_executor.cpp new file mode 100644 index 00000000000..65b75bad9ab --- /dev/null +++ b/dbms/src/Flash/tests/gtest_projection_executor.cpp @@ -0,0 +1,228 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
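The new projection executor test below drives `scan -> project` requests over a five-column mock table and checks the result at several concurrency levels with an order-insensitive comparison. A minimal sketch of that pattern, assuming the fixture members the file defines (`db_name`, `table_name`, `col_names`, `col4_sorted_asc`):

```cpp
// Hedged sketch mirroring executeWithConcurrency() below: build
// `SELECT col4 FROM test_db.projection_test_table`, then compare rows
// regardless of order at concurrency 1, 3, 5, 7 and 9.
auto request = context.scan(db_name, table_name).project({col_names[4]}).build(context);
for (size_t concurrency = 1; concurrency < 10; concurrency += 2)
    ASSERT_COLUMNS_EQ_UR(executeStreams(request, concurrency), {toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
```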
+ +#include +#include + +namespace DB +{ +namespace tests +{ + +class ExecutorProjectionTestRunner : public DB::tests::ExecutorTest +{ +public: + using ColDataString = std::vector::FieldType>>; + using ColDataInt32 = std::vector::FieldType>>; + + void initializeContext() override + { + ExecutorTest::initializeContext(); + + context.addMockTable({db_name, table_name}, + {{col_names[0], TiDB::TP::TypeString}, + {col_names[1], TiDB::TP::TypeString}, + {col_names[2], TiDB::TP::TypeString}, + {col_names[3], TiDB::TP::TypeLong}, + {col_names[4], TiDB::TP::TypeLong}}, + {toNullableVec(col_names[0], col0), + toNullableVec(col_names[1], col1), + toNullableVec(col_names[2], col2), + toNullableVec(col_names[3], col3), + toNullableVec(col_names[4], col4)}); + } + + template + std::shared_ptr buildDAGRequest(T param) + { + return context.scan(db_name, table_name).project(param).build(context); + }; + + void executeWithConcurrency(const std::shared_ptr & request, const ColumnsWithTypeAndName & expect_columns) + { + for (size_t i = 1; i < 10; i += 2) + { + ASSERT_COLUMNS_EQ_UR(executeStreams(request, i), expect_columns); + } + } + + /// Prepare column data + const ColDataString col0{"col0-0", "col0-1", "", "col0-2", {}, "col0-3", ""}; + const ColDataString col1{"col1-0", {}, "", "col1-1", "", "col1-2", "col1-3"}; + const ColDataString col2{"", "col2-0", "col2-1", {}, "col2-3", {}, "col2-4"}; + const ColDataInt32 col3{1, {}, 0, -111111, {}, 0, 9999}; + + /** Each value in col4 should be different from each other so that topn + * could sort the columns into an unique result, or multi-results could + * be right. + */ + const ColDataInt32 col4{0, 5, -123, -234, {}, 24353, 9999}; + + /// Results after sorted by col4 + const ColDataString col0_sorted_asc{{}, "col0-2", "", "col0-0", "col0-1", "", "col0-3"}; + const ColDataString col1_sorted_asc{"", "col1-1", "", "col1-0", {}, "col1-3", "col1-2"}; + const ColDataString col2_sorted_asc{"col2-3", {}, "col2-1", "", "col2-0", "col2-4", {}}; + const ColDataInt32 col3_sorted_asc{{}, -111111, 0, 1, {}, 9999, 0}; + const ColDataInt32 col4_sorted_asc{{}, -234, -123, 0, 5, 9999, 24353}; + + /// Prepare some names + std::vector col_names{"col0", "col1", "col2", "col3", "col4"}; + const String db_name{"test_db"}; + const String table_name{"projection_test_table"}; +}; + +TEST_F(ExecutorProjectionTestRunner, Projection) +try +{ + wrapForDisEnablePlanner([&]() { + /// Check single column + auto request = buildDAGRequest({col_names[4]}); + executeWithConcurrency(request, {toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Check multi columns + request = buildDAGRequest({col_names[0], col_names[4]}); + executeWithConcurrency(request, + { + toNullableVec(col_names[0], col0_sorted_asc), + toNullableVec(col_names[4], col4_sorted_asc), + }); + + /// Check multi columns + request = buildDAGRequest({col_names[0], col_names[1], col_names[4]}); + executeWithConcurrency(request, + {toNullableVec(col_names[0], col0_sorted_asc), + toNullableVec(col_names[1], col1_sorted_asc), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Check duplicate columns + request = buildDAGRequest({col_names[4], col_names[4], col_names[4]}); + executeWithConcurrency(request, + {toNullableVec(col_names[4], col4_sorted_asc), + toNullableVec(col_names[4], col4_sorted_asc), + toNullableVec(col_names[4], col4_sorted_asc)}); + + { + /// Check large number of columns + const size_t col_num = 100; + MockColumnNameVec projection_input; + ColumnsWithTypeAndName columns; + auto expect_column = 
toNullableVec(col_names[4], col4_sorted_asc); + + for (size_t i = 0; i < col_num; ++i) + { + projection_input.push_back(col_names[4]); + columns.push_back(expect_column); + } + + request = buildDAGRequest(projection_input); + executeWithConcurrency(request, columns); + } + }); +} +CATCH + +TEST_F(ExecutorProjectionTestRunner, ProjectionFunction) +try +{ + wrapForDisEnablePlanner([&]() { + std::shared_ptr request; + + /// Test "equal" function + + /// Data type: TypeString + request = buildDAGRequest({eq(col(col_names[0]), col(col_names[0])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 1, 1, 1, 1, 1}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + request = buildDAGRequest({eq(col(col_names[0]), col(col_names[1])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 1, 0, {}, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Data type: TypeLong + request = buildDAGRequest({eq(col(col_names[3]), col(col_names[4])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 0, 0, {}, 1, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + + /// Test "greater" function + + /// Data type: TypeString + request = buildDAGRequest({gt(col(col_names[0]), col(col_names[1])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 0, 0, {}, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + request = buildDAGRequest({gt(col(col_names[1]), col(col_names[0])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 0, 1, {}, 1, 1}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Data type: TypeLong + request = buildDAGRequest({gt(col(col_names[3]), col(col_names[4])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 1, 1, {}, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + request = buildDAGRequest({gt(col(col_names[4]), col(col_names[3])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 0, 0, {}, 0, 1}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + + /// Test "and" function + + /// Data type: TypeString + request = buildDAGRequest({And(col(col_names[0]), col(col_names[0])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 0, 0, 0, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + request = buildDAGRequest({And(col(col_names[0]), col(col_names[1])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({0, 0, 0, 0, 0, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Data type: TypeLong + request = buildDAGRequest({And(col(col_names[3]), col(col_names[4])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 0, 0, {}, 1, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Test "not" function + + /// Data type: TypeString + request = buildDAGRequest({NOT(col(col_names[0])), NOT(col(col_names[1])), NOT(col(col_names[2])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 1, 1, 1, 1, 1}), + toNullableVec({1, 1, 1, 1, {}, 1, 1}), + toNullableVec({1, {}, 1, 1, 1, 1, {}}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Data type: TypeLong + request = buildDAGRequest({NOT(col(col_names[3])), NOT(col(col_names[4])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 1, 0, {}, 0, 1}), + toNullableVec({{}, 0, 0, 1, 0, 0, 0}), + toNullableVec(col_names[4], 
col4_sorted_asc)}); + + /// TODO more functions... + }); +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/tests/gtest_topn_executor.cpp b/dbms/src/Flash/tests/gtest_topn_executor.cpp new file mode 100644 index 00000000000..a6ba3183118 --- /dev/null +++ b/dbms/src/Flash/tests/gtest_topn_executor.cpp @@ -0,0 +1,225 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ + +class ExecutorTopNTestRunner : public DB::tests::ExecutorTest +{ +public: + using ColStringType = std::optional::FieldType>; + using ColInt32Type = std::optional::FieldType>; + using ColumnWithString = std::vector; + using ColumnWithInt32 = std::vector; + + void initializeContext() override + { + ExecutorTest::initializeContext(); + + context.addMockTable({db_name, table_single_name}, + {{single_col_name, TiDB::TP::TypeString}}, + {toNullableVec(single_col_name, col0)}); + + context.addMockTable({db_name, table_name}, + {{col_name[0], TiDB::TP::TypeLong}, + {col_name[1], TiDB::TP::TypeString}, + {col_name[2], TiDB::TP::TypeString}, + {col_name[3], TiDB::TP::TypeLong}}, + {toNullableVec(col_name[0], col_age), + toNullableVec(col_name[1], col_gender), + toNullableVec(col_name[2], col_country), + toNullableVec(col_name[3], col_salary)}); + } + + std::shared_ptr buildDAGRequest(const String & table_name, const String & col_name, bool is_desc, int limit_num) + { + return context.scan(db_name, table_name).topN(col_name, is_desc, limit_num).build(context); + } + + std::shared_ptr buildDAGRequest(const String & table_name, MockOrderByItemVec order_by_items, int limit, MockAstVec func_proj_ast = {}, MockColumnNameVec out_proj_ast = {}) + { + if (func_proj_ast.size() == 0) + return context.scan(db_name, table_name).topN(order_by_items, limit).build(context); + else + return context.scan(db_name, table_name).project(func_proj_ast).topN(order_by_items, limit).project(out_proj_ast).build(context); + } + + /// Prepare some names + const String db_name{"test_db"}; + + const String table_single_name{"topn_single_table"}; /// For single column test + const String single_col_name{"single_col"}; + ColumnWithString col0{"col0-0", "col0-1", "col0-2", {}, "col0-4", {}, "col0-6", "col0-7"}; + + const String table_name{"clerk"}; + const std::vector col_name{"age", "gender", "country", "salary"}; + ColumnWithInt32 col_age{{}, 27, 32, 36, {}, 34}; + ColumnWithString col_gender{"female", "female", "male", "female", "male", "male"}; + ColumnWithString col_country{"korea", "usa", "usa", "china", "china", "china"}; + ColumnWithInt32 col_salary{1300, 0, {}, 900, {}, -300}; +}; + +TEST_F(ExecutorTopNTestRunner, TopN) +try +{ + wrapForDisEnablePlanner([&]() { + std::shared_ptr request; + std::vector expect_cols; + + { + /// Test single column + size_t col_data_num = col0.size(); + for (size_t i = 0; i <= 1; ++i) + { + bool is_desc; + is_desc = static_cast(i); /// Set descending or ascending order + if (is_desc) + sort(col0.begin(),
col0.end(), std::greater()); /// Sort col0 for the following comparison + else + sort(col0.begin(), col0.end()); + + for (size_t limit_num = 0; limit_num <= col_data_num + 5; ++limit_num) + { + request = buildDAGRequest(table_single_name, single_col_name, is_desc, limit_num); + + expect_cols.clear(); + if (limit_num == 0 || limit_num > col_data_num) + expect_cols.push_back({toNullableVec(single_col_name, ColumnWithString(col0.begin(), col0.end()))}); + else + expect_cols.push_back({toNullableVec(single_col_name, ColumnWithString(col0.begin(), col0.begin() + limit_num))}); + + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[0]); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), expect_cols[0]); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 4), expect_cols[0]); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 8), expect_cols[0]); + } + } + } + + { + /// Test multi-columns + expect_cols = {{toNullableVec(col_name[0], ColumnWithInt32{36, 34, 32, 27, {}, {}}), + toNullableVec(col_name[1], ColumnWithString{"female", "male", "male", "female", "male", "female"}), + toNullableVec(col_name[2], ColumnWithString{"china", "china", "usa", "usa", "china", "korea"}), + toNullableVec(col_name[3], ColumnWithInt32{900, -300, {}, 0, {}, 1300})}, + {toNullableVec(col_name[0], ColumnWithInt32{32, {}, 34, 27, 36, {}}), + toNullableVec(col_name[1], ColumnWithString{"male", "male", "male", "female", "female", "female"}), + toNullableVec(col_name[2], ColumnWithString{"usa", "china", "china", "usa", "china", "korea"}), + toNullableVec(col_name[3], ColumnWithInt32{{}, {}, -300, 0, 900, 1300})}, + {toNullableVec(col_name[0], ColumnWithInt32{34, {}, 32, 36, {}, 27}), + toNullableVec(col_name[1], ColumnWithString{"male", "male", "male", "female", "female", "female"}), + toNullableVec(col_name[2], ColumnWithString{"china", "china", "usa", "china", "korea", "usa"}), + toNullableVec(col_name[3], ColumnWithInt32{-300, {}, {}, 900, 1300, 0})}}; + + std::vector order_by_items{ + /// select * from clerk order by age DESC, gender DESC; + {MockOrderByItem(col_name[0], true), MockOrderByItem(col_name[1], true)}, + /// select * from clerk order by gender DESC, salary ASC; + {MockOrderByItem(col_name[1], true), MockOrderByItem(col_name[3], false)}, + /// select * from clerk order by gender DESC, country ASC, salary DESC; + {MockOrderByItem(col_name[1], true), MockOrderByItem(col_name[2], false), MockOrderByItem(col_name[3], true)}}; + + size_t test_num = expect_cols.size(); + + for (size_t i = 0; i < test_num; ++i) + { + request = buildDAGRequest(table_name, order_by_items[i], 100); + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[i]); + } + } + }); +} +CATCH + +TEST_F(ExecutorTopNTestRunner, TopNFunction) +try +{ + wrapForDisEnablePlanner([&]() { + std::shared_ptr request; + std::vector expect_cols; + MockColumnNameVec output_projection{col_name[0], col_name[1], col_name[2], col_name[3]}; + MockAstVec func_projection; // Do function operation for topn + MockOrderByItemVec order_by_items; + ASTPtr col0_ast = col(col_name[0]); + ASTPtr col1_ast = col(col_name[1]); + ASTPtr col2_ast = col(col_name[2]); + ASTPtr col3_ast = col(col_name[3]); + ASTPtr func_ast; + + { + /// "and" function + expect_cols = {{toNullableVec(col_name[0], ColumnWithInt32{{}, {}, 32, 27, 36, 34}), + toNullableVec(col_name[1], ColumnWithString{"female", "male", "male", "female", "female", "male"}), + toNullableVec(col_name[2], ColumnWithString{"korea", "china", "usa", "usa", "china", "china"}), + toNullableVec(col_name[3], ColumnWithInt32{1300, 
{}, {}, 0, 900, -300})}}; + + { + /// select * from clerk order by age and salary ASC limit 100; + order_by_items = {MockOrderByItem("and(age, salary)", false)}; + func_ast = And(col(col_name[0]), col(col_name[3])); + func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast}; + + request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection); + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[0]); + } + } + + { + /// "equal" function + expect_cols = {{toNullableVec(col_name[0], ColumnWithInt32{27, 36, 34, 32, {}, {}}), + toNullableVec(col_name[1], ColumnWithString{"female", "female", "male", "male", "female", "male"}), + toNullableVec(col_name[2], ColumnWithString{"usa", "china", "china", "usa", "korea", "china"}), + toNullableVec(col_name[3], ColumnWithInt32{0, 900, -300, {}, 1300, {}})}}; + + { + /// select age, salary from clerk order by age = salary DESC limit 100; + order_by_items = {MockOrderByItem("equals(age, salary)", true)}; + func_ast = eq(col(col_name[0]), col(col_name[3])); + func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast}; + + request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection); + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[0]); + } + } + + { + /// "greater" function + expect_cols = {{toNullableVec(col_name[0], ColumnWithInt32{{}, 32, {}, 36, 27, 34}), + toNullableVec(col_name[1], ColumnWithString{"female", "male", "male", "female", "female", "male"}), + toNullableVec(col_name[2], ColumnWithString{"korea", "usa", "china", "china", "usa", "china"}), + toNullableVec(col_name[3], ColumnWithInt32{1300, {}, {}, 900, 0, -300})}}; + + { + /// select age, gender, country, salary from clerk order by age > salary ASC limit 100; + order_by_items = {MockOrderByItem("greater(age, salary)", false)}; + func_ast = gt(col(col_name[0]), col(col_name[3])); + func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast}; + + request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection); + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[0]); + } + } + + /// TODO more functions... + }); +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h new file mode 100644 index 00000000000..395ecc5b9eb --- /dev/null +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -0,0 +1,210 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ + +template +ALWAYS_INLINE inline int signum(T val) +{ + return (0 < val) - (val < 0); +} + +// Check equality is much faster than other comparison. 
+// - check size first +// - return 0 if equal else 1 +__attribute__((flatten, always_inline, pure)) inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) +{ + return StringRef(lhs) == StringRef(rhs) ? 0 : 1; +} + +// Compare str view by memcmp +__attribute__((flatten, always_inline, pure)) inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) +{ + return signum(v1.compare(v2)); +} + +constexpr char SPACE = ' '; + +// Remove tail space +__attribute__((flatten, always_inline, pure)) inline std::string_view RightTrim(const std::string_view & v) +{ + if (likely(v.empty() || v.back() != SPACE)) + return v; + size_t end = v.find_last_not_of(SPACE); + return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); +} + +__attribute__((flatten, always_inline, pure)) inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) +{ + return RawStrCompare(RightTrim(va), RightTrim(vb)); +} + +// If true, only need to check equal or not. +template +struct IsEqualRelated +{ + static constexpr const bool value = false; +}; + +// For `EqualsOp` and `NotEqualsOp`, value is true. +template +struct IsEqualRelated> +{ + static constexpr const bool value = true; +}; +template +struct IsEqualRelated> +{ + static constexpr const bool value = true; +}; + +// Loop columns and invoke callback for each pair. +template +__attribute__((flatten, always_inline)) inline void LoopTwoColumns( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const ColumnString::Chars_t & b_data, + const ColumnString::Offsets & b_offsets, + size_t size, + F && func) +{ + for (size_t i = 0; i < size; ++i) + { + size_t a_size = StringUtil::sizeAt(a_offsets, i) - 1; + size_t b_size = StringUtil::sizeAt(b_offsets, i) - 1; + const auto * a_ptr = reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]); + const auto * b_ptr = reinterpret_cast(&b_data[StringUtil::offsetAt(b_offsets, i)]); + + func({a_ptr, a_size}, {b_ptr, b_size}, i); + } +} + +// Loop one column and invoke callback for each pair. +template +__attribute__((flatten, always_inline)) inline void LoopOneColumn( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + size_t size, + F && func) +{ + for (size_t i = 0; i < size; ++i) + { + size_t a_size = StringUtil::sizeAt(a_offsets, i) - 1; + const auto * a_ptr = reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]); + + func({a_ptr, a_size}, i); + } +} + +// Handle str-column compare str-column. 
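The str-vs-str entry point this comment introduces only fast-paths the binary collations; everything else falls back to the generic loop in `FunctionsComparison.h`. The trimming helpers above exist because `UTF8_BIN`/`UTF8MB4_BIN` are PAD SPACE collations, where trailing spaces must not influence comparison. A hedged usage sketch of those helpers:

```cpp
// Both views compare equal once trailing spaces are trimmed away.
std::string_view a = "abc  ";
std::string_view b = "abc";
uint8_t eq = RawStrEqualCompare(RightTrim(a), RightTrim(b)); // 0 => equal
int ord = RtrimStrCompare(a, b);                             // 0 => same ordering
```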
+// - Optimize UTF8_BIN and UTF8MB4_BIN +// - Check if columns do NOT contain tail space +// - If Op is `EqualsOp` or `NotEqualsOp`, optimize comparison by faster way +template +ALWAYS_INLINE inline bool StringVectorStringVector( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const ColumnString::Chars_t & b_data, + const ColumnString::Offsets & b_offsets, + const TiDB::TiDBCollatorPtr & collator, + Result & c) +{ + bool use_optimized_path = false; + + switch (collator->getCollatorId()) + { + case TiDB::ITiDBCollator::UTF8MB4_BIN: + case TiDB::ITiDBCollator::UTF8_BIN: + { + size_t size = a_offsets.size(); + + LoopTwoColumns(a_data, a_offsets, b_data, b_offsets, size, [&c](const std::string_view & va, const std::string_view & vb, size_t i) { + if constexpr (IsEqualRelated::value) + { + c[i] = Op::apply(RawStrEqualCompare(RightTrim(va), RightTrim(vb)), 0); + } + else + { + c[i] = Op::apply(RtrimStrCompare(va, vb), 0); + } + }); + + use_optimized_path = true; + + break; + } + default: + break; + } + return use_optimized_path; +} + +// Handle str-column compare const-str. +// - Optimize UTF8_BIN and UTF8MB4_BIN +// - Right trim const-str first +// - Check if column does NOT contain tail space +// - If Op is `EqualsOp` or `NotEqualsOp`, optimize comparison by faster way +template +ALWAYS_INLINE inline bool StringVectorConstant( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const std::string_view & b, + const TiDB::TiDBCollatorPtr & collator, + Result & c) +{ + bool use_optimized_path = false; + + switch (collator->getCollatorId()) + { + case TiDB::ITiDBCollator::UTF8MB4_BIN: + case TiDB::ITiDBCollator::UTF8_BIN: + { + size_t size = a_offsets.size(); + + std::string_view tar_str_view = RightTrim(b); // right trim const-str first + + LoopOneColumn(a_data, a_offsets, size, [&c, &tar_str_view](const std::string_view & view, size_t i) { + if constexpr (IsEqualRelated::value) + { + c[i] = Op::apply(RawStrEqualCompare(RightTrim(view), tar_str_view), 0); + } + else + { + c[i] = Op::apply(RawStrCompare(RightTrim(view), tar_str_view), 0); + } + }); + + use_optimized_path = true; + break; + } + default: + break; + } + return use_optimized_path; +} + +} // namespace DB diff --git a/dbms/src/Functions/FunctionsComparison.h b/dbms/src/Functions/FunctionsComparison.h index 1c63a286452..8f7502fba85 100644 --- a/dbms/src/Functions/FunctionsComparison.h +++ b/dbms/src/Functions/FunctionsComparison.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -301,6 +302,12 @@ struct StringComparisonWithCollatorImpl const TiDB::TiDBCollatorPtr & collator, PaddedPODArray & c) { + bool optimized_path = StringVectorStringVector(a_data, a_offsets, b_data, b_offsets, collator, c); + if (optimized_path) + { + return; + } + size_t size = a_offsets.size(); for (size_t i = 0; i < size; ++i) @@ -317,10 +324,17 @@ struct StringComparisonWithCollatorImpl static void NO_INLINE stringVectorConstant( const ColumnString::Chars_t & a_data, const ColumnString::Offsets & a_offsets, - const std::string & b, + const std::string_view & b, const TiDB::TiDBCollatorPtr & collator, PaddedPODArray & c) { + bool optimized_path = StringVectorConstant(a_data, a_offsets, b, collator, c); + + if (optimized_path) + { + return; + } + size_t size = a_offsets.size(); ColumnString::Offset b_size = b.size(); const char * b_data = reinterpret_cast(b.data()); @@ -332,7 +346,7 @@ struct StringComparisonWithCollatorImpl } static void constantStringVector( 
- const std::string & a, + const std::string_view & a, const ColumnString::Chars_t & b_data, const ColumnString::Offsets & b_offsets, const TiDB::TiDBCollatorPtr & collator, @@ -342,8 +356,8 @@ struct StringComparisonWithCollatorImpl } static void constantConstant( - const std::string & a, - const std::string & b, + const std::string_view & a, + const std::string_view & b, const TiDB::TiDBCollatorPtr & collator, ResultType & c) { @@ -706,6 +720,25 @@ class FunctionComparison : public IFunction } } + static inline std::string_view genConstStrRef(const ColumnConst * c0_const) + { + std::string_view c0_const_str_ref{}; + if (c0_const) + { + if (const auto * c0_const_string = checkAndGetColumn(&c0_const->getDataColumn()); c0_const_string) + { + c0_const_str_ref = std::string_view(c0_const_string->getDataAt(0)); + } + else if (const auto * c0_const_fixed_string = checkAndGetColumn(&c0_const->getDataColumn()); c0_const_fixed_string) + { + c0_const_str_ref = std::string_view(c0_const_fixed_string->getDataAt(0)); + } + else + throw Exception("Logical error: ColumnConst contains not String nor FixedString column", ErrorCodes::ILLEGAL_COLUMN); + } + return c0_const_str_ref; + } + template bool executeStringWithCollator( Block & block, @@ -720,10 +753,13 @@ class FunctionComparison : public IFunction using ResultType = typename ResultColumnType::value_type; using StringImpl = StringComparisonWithCollatorImpl, ResultType>; + std::string_view c0_const_str_ref = genConstStrRef(c0_const); + std::string_view c1_const_str_ref = genConstStrRef(c1_const); + if (c0_const && c1_const) { ResultType res = 0; - StringImpl::constantConstant(c0_const->getValue(), c1_const->getValue(), collator, res); + StringImpl::constantConstant(c0_const_str_ref, c1_const_str_ref, collator, res); block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(c0_const->size(), toField(res)); return true; } @@ -745,12 +781,12 @@ class FunctionComparison : public IFunction StringImpl::stringVectorConstant( c0_string->getChars(), c0_string->getOffsets(), - c1_const->getValue(), + c1_const_str_ref, collator, c_res->getData()); else if (c0_const && c1_string) StringImpl::constantStringVector( - c0_const->getValue(), + c0_const_str_ref, c1_string->getChars(), c1_string->getOffsets(), collator, @@ -770,8 +806,8 @@ class FunctionComparison : public IFunction template bool executeString(Block & block, size_t result, const IColumn * c0, const IColumn * c1) const { - const ColumnString * c0_string = checkAndGetColumn(c0); - const ColumnString * c1_string = checkAndGetColumn(c1); + const auto * c0_string = checkAndGetColumn(c0); + const auto * c1_string = checkAndGetColumn(c1); const ColumnConst * c0_const = checkAndGetColumnConstStringOrFixedString(c0); const ColumnConst * c1_const = checkAndGetColumnConstStringOrFixedString(c1); diff --git a/dbms/src/Functions/FunctionsConversion.cpp b/dbms/src/Functions/FunctionsConversion.cpp index 118574ed33d..0446f76bd51 100644 --- a/dbms/src/Functions/FunctionsConversion.cpp +++ b/dbms/src/Functions/FunctionsConversion.cpp @@ -240,6 +240,7 @@ void registerFunctionsConversion(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); factory.registerFunction>(); factory.registerFunction>(); factory.registerFunction>(); diff --git a/dbms/src/Functions/FunctionsConversion.h b/dbms/src/Functions/FunctionsConversion.h index ddf64a70ca1..e8333ceeeea 100644 --- a/dbms/src/Functions/FunctionsConversion.h +++ 
b/dbms/src/Functions/FunctionsConversion.h @@ -1751,6 +1751,120 @@ class FunctionDateFormat : public IFunction } }; +class FunctionGetFormat : public IFunction +{ +private: + static String get_format(const StringRef & time_type, const StringRef & location) + { + if (time_type == "DATE") + { + if (location == "USA") + return "%m.%d.%Y"; + else if (location == "JIS") + return "%Y-%m-%d"; + else if (location == "ISO") + return "%Y-%m-%d"; + else if (location == "EUR") + return "%d.%m.%Y"; + else if (location == "INTERNAL") + return "%Y%m%d"; + } + else if (time_type == "DATETIME" || time_type == "TIMESTAMP") + { + if (location == "USA") + return "%Y-%m-%d %H.%i.%s"; + else if (location == "JIS") + return "%Y-%m-%d %H:%i:%s"; + else if (location == "ISO") + return "%Y-%m-%d %H:%i:%s"; + else if (location == "EUR") + return "%Y-%m-%d %H.%i.%s"; + else if (location == "INTERNAL") + return "%Y%m%d%H%i%s"; + } + else if (time_type == "TIME") + { + if (location == "USA") + return "%h:%i:%s %p"; + else if (location == "JIS") + return "%H:%i:%s"; + else if (location == "ISO") + return "%H:%i:%s"; + else if (location == "EUR") + return "%H.%i.%s"; + else if (location == "INTERNAL") + return "%H%i%s"; + } + return ""; + } + +public: + static constexpr auto name = "getFormat"; + static FunctionPtr create(const Context &) { return std::make_shared(); }; + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (!arguments[0].type->isString()) + throw Exception("First argument for function " + getName() + " must be String", ErrorCodes::ILLEGAL_COLUMN); + if (!arguments[1].type->isString()) + throw Exception("Second argument for function " + getName() + " must be String", ErrorCodes::ILLEGAL_COLUMN); + + return std::make_shared(); + } + + bool useDefaultImplementationForConstants() const override { return true; } + + /** + * @brief The first argument is designed as a MySQL reserved word. You would encounter a syntax error when wrapping it in quotes in SQL. + * For example, select GET_FORMAT("DATE", "USA") will fail, while removing the quotes solves the problem. + * Thus the first argument should always be a ColumnConst.
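A few hedged input/output pairs for `get_format` above, matching MySQL's documented `GET_FORMAT` behavior, make the branch table quicker to scan:

```cpp
// Illustrative pairs; unknown combinations fall through to the empty string.
//   get_format("DATE",     "USA") -> "%m.%d.%Y"
//   get_format("DATETIME", "JIS") -> "%Y-%m-%d %H:%i:%s"
//   get_format("TIME",     "EUR") -> "%H.%i.%s"
```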
See details in the link below: + * https://dev.mysql.com/doc/refman/5.7/en/date-and-time-functions.html#function_get-format + * + * @return ColumnNumbers + */ + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; } + + void executeImpl(Block & block, const ColumnNumbers & arguments, const size_t result) const override + { + const auto * location_col = checkAndGetColumn(block.getByPosition(arguments[1]).column.get()); + assert(location_col); + size_t size = location_col->size(); + const auto & time_type_col = block.getByPosition(arguments[0]).column; + auto col_to = ColumnString::create(); + + if (time_type_col->isColumnConst()) + { + const auto & time_type_col_const = checkAndGetColumnConst(time_type_col.get()); + const auto & time_type = time_type_col_const->getValue(); + + ColumnString::Chars_t & data_to = col_to->getChars(); + ColumnString::Offsets & offsets_to = col_to->getOffsets(); + auto max_length = 18; + data_to.resize(size * max_length); + offsets_to.resize(size); + WriteBufferFromVector write_buffer(data_to); + for (size_t i = 0; i < size; ++i) + { + const auto & location = location_col->getDataAt(i); + const auto & result = get_format(StringRef(time_type), location); + write_buffer.write(result.c_str(), result.size()); + writeChar(0, write_buffer); + offsets_to[i] = write_buffer.count(); + } + data_to.resize(write_buffer.count()); + block.getByPosition(result).column = std::move(col_to); + } + else + { + throw Exception("First argument for function " + getName() + " must be String constant", ErrorCodes::ILLEGAL_COLUMN); + } + } +}; + struct NameStrToDateDate { static constexpr auto name = "strToDateDate"; diff --git a/dbms/src/Functions/FunctionsDuration.cpp b/dbms/src/Functions/FunctionsDuration.cpp index ea7b86ac670..9ccafd2794d 100644 --- a/dbms/src/Functions/FunctionsDuration.cpp +++ b/dbms/src/Functions/FunctionsDuration.cpp @@ -97,6 +97,57 @@ void FunctionDurationSplit::executeImpl(Block & block, const ColumnNumbers ErrorCodes::ILLEGAL_COLUMN); }; +template +DataTypePtr FunctionMyDurationToSec::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const +{ + if (!arguments[0].type->isMyTime()) + { + throw Exception( + fmt::format("Illegal type {} of the first argument of function {}", arguments[0].type->getName(), getName()), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + return std::make_shared(); +} + +template +void FunctionMyDurationToSec::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const +{ + const auto * from_type = checkAndGetDataType(block.getByPosition(arguments[0]).type.get()); + if (from_type == nullptr) + { + throw Exception( + fmt::format( + "Illegal column {} of the first argument of function {}", + block.getByPosition(arguments[0]).column->getName(), + name), + ErrorCodes::ILLEGAL_COLUMN); + } + + using FromFieldType = typename DataTypeMyDuration::FieldType; + const auto * col_from = checkAndGetColumn>(block.getByPosition(arguments[0]).column.get()); + if (col_from != nullptr) + { + const typename ColumnVector::Container & vec_from = col_from->getData(); + const size_t size = vec_from.size(); + auto col_to = ColumnVector::create(size); + typename ColumnVector::Container & vec_to = col_to->getData(); + + for (size_t i = 0; i < size; ++i) + { + MyDuration val(vec_from[i], from_type->getFsp()); + vec_to[i] = Impl::apply(val); + } + block.getByPosition(result).column = std::move(col_to); + } + else + throw Exception( + fmt::format( + "Illegal column {} of the first argument of function {}", + 
block.getByPosition(arguments[0]).column->getName(), + name), + ErrorCodes::ILLEGAL_COLUMN); +} + struct DurationSplitHourImpl { static constexpr auto name = "hour"; @@ -133,11 +184,27 @@ struct DurationSplitMicroSecondImpl } }; +struct TiDBTimeToSecTransformerImpl +{ + static constexpr auto name = "tidbTimeToSec"; + static Int64 apply(const MyDuration & val) + { + Int64 sign = 1; + if (val.isNeg()) + { + sign = -1; + } + return sign * (val.hours() * 3600 + val.minutes() * 60 + val.seconds()); + } +}; + using FunctionDurationHour = FunctionDurationSplit; using FunctionDurationMinute = FunctionDurationSplit; using FunctionDurationSecond = FunctionDurationSplit; using FunctionDurationMicroSecond = FunctionDurationSplit; +using FunctionToTiDBTimeToSec = FunctionMyDurationToSec; + void registerFunctionsDuration(FunctionFactory & factory) { factory.registerFunction(); @@ -146,5 +213,7 @@ void registerFunctionsDuration(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + + factory.registerFunction(); } } // namespace DB diff --git a/dbms/src/Functions/FunctionsDuration.h b/dbms/src/Functions/FunctionsDuration.h index 4247cde03ff..5bc54d425f4 100644 --- a/dbms/src/Functions/FunctionsDuration.h +++ b/dbms/src/Functions/FunctionsDuration.h @@ -69,4 +69,23 @@ class FunctionDurationSplit : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override; }; +template +class FunctionMyDurationToSec : public IFunction +{ +public: + static constexpr auto name = Impl::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); }; + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override; + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override; +}; + } // namespace DB \ No newline at end of file diff --git a/dbms/src/Functions/FunctionsGeo.cpp b/dbms/src/Functions/FunctionsGeo.cpp index a6fd2ff522e..02e11b66d77 100644 --- a/dbms/src/Functions/FunctionsGeo.cpp +++ b/dbms/src/Functions/FunctionsGeo.cpp @@ -28,13 +28,6 @@ #include #include - -namespace ProfileEvents -{ -extern const Event PolygonsAddedToPool; -extern const Event PolygonsInPoolAllocatedBytes; -} // namespace ProfileEvents - namespace DB { namespace ErrorCodes @@ -60,9 +53,6 @@ ColumnPtr callPointInPolygonImplWithPool(const IColumn & x, const IColumn & y, P /// To allocate memory. 
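    /// init() builds the polygon's internal lookup structures up front; because this impl comes from
    /// an object pool (see callPointInPolygonImplWithPool), later calls with the same constant
    /// polygon can reuse that work.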
ptr->init(); - ProfileEvents::increment(ProfileEvents::PolygonsAddedToPool); - ProfileEvents::increment(ProfileEvents::PolygonsInPoolAllocatedBytes, ptr->getAllocatedBytes()); - return ptr.release(); }; @@ -121,30 +111,30 @@ class FunctionPointInPolygon : public IFunction throw Exception("Too few arguments", ErrorCodes::TOO_LESS_ARGUMENTS_FOR_FUNCTION); } - auto getMsgPrefix = [this](size_t i) { + auto get_msg_prefix = [this](size_t i) { return "Argument " + toString(i + 1) + " for function " + getName(); }; for (size_t i = 1; i < arguments.size(); ++i) { - auto * array = checkAndGetDataType(arguments[i].get()); + const auto * array = checkAndGetDataType(arguments[i].get()); if (array == nullptr && i != 1) - throw Exception(getMsgPrefix(i) + " must be array of tuples.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(get_msg_prefix(i) + " must be array of tuples.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - auto * tuple = checkAndGetDataType(array ? array->getNestedType().get() : arguments[i].get()); + const auto * tuple = checkAndGetDataType(array ? array->getNestedType().get() : arguments[i].get()); if (tuple == nullptr) - throw Exception(getMsgPrefix(i) + " must contains tuple.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(get_msg_prefix(i) + " must contains tuple.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); const DataTypes & elements = tuple->getElements(); if (elements.size() != 2) - throw Exception(getMsgPrefix(i) + " must have exactly two elements.", ErrorCodes::BAD_ARGUMENTS); + throw Exception(get_msg_prefix(i) + " must have exactly two elements.", ErrorCodes::BAD_ARGUMENTS); for (auto j : ext::range(0, elements.size())) { if (!elements[j]->isNumber()) { - throw Exception(getMsgPrefix(i) + " must contains numeric tuple at position " + toString(j + 1), + throw Exception(get_msg_prefix(i) + " must contains numeric tuple at position " + toString(j + 1), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } } @@ -156,10 +146,10 @@ class FunctionPointInPolygon : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const IColumn * point_col = block.getByPosition(arguments[0]).column.get(); - auto const_tuple_col = checkAndGetColumn(point_col); + const auto * const_tuple_col = checkAndGetColumn(point_col); if (const_tuple_col) point_col = &const_tuple_col->getDataColumn(); - auto tuple_col = checkAndGetColumn(point_col); + const auto * tuple_col = checkAndGetColumn(point_col); if (!tuple_col) { @@ -207,18 +197,18 @@ class FunctionPointInPolygon : public IFunction { Polygon polygon; - auto getMsgPrefix = [this](size_t i) { + auto get_msg_prefix = [this](size_t i) { return "Argument " + toString(i + 1) + " for function " + getName(); }; for (size_t i = 1; i < arguments.size(); ++i) { - auto const_col = checkAndGetColumn(block.getByPosition(arguments[i]).column.get()); - auto array_col = const_col ? checkAndGetColumn(&const_col->getDataColumn()) : nullptr; - auto tuple_col = array_col ? checkAndGetColumn(&array_col->getData()) : nullptr; + const auto * const_col = checkAndGetColumn(block.getByPosition(arguments[i]).column.get()); + const auto * array_col = const_col ? checkAndGetColumn(&const_col->getDataColumn()) : nullptr; + const auto * tuple_col = array_col ? 
checkAndGetColumn(&array_col->getData()) : nullptr; if (!tuple_col) - throw Exception(getMsgPrefix(i) + " must be constant array of tuples.", ErrorCodes::ILLEGAL_COLUMN); + throw Exception(get_msg_prefix(i) + " must be constant array of tuples.", ErrorCodes::ILLEGAL_COLUMN); const auto & tuple_columns = tuple_col->getColumns(); const auto & column_x = tuple_columns[0]; @@ -232,7 +222,7 @@ class FunctionPointInPolygon : public IFunction auto size = column_x->size(); if (size == 0) - throw Exception(getMsgPrefix(i) + " shouldn't be empty.", ErrorCodes::ILLEGAL_COLUMN); + throw Exception(get_msg_prefix(i) + " shouldn't be empty.", ErrorCodes::ILLEGAL_COLUMN); for (auto j : ext::range(0, size)) { @@ -246,11 +236,11 @@ class FunctionPointInPolygon : public IFunction container.push_back(container.front()); } - auto callImpl = use_object_pool + auto call_impl = use_object_pool ? FunctionPointInPolygonDetail::callPointInPolygonImplWithPool, PointInPolygonImpl> : FunctionPointInPolygonDetail::callPointInPolygonImpl, PointInPolygonImpl>; - return callImpl(x, y, polygon); + return call_impl(x, y, polygon); } }; diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index b9f20e45134..76022b983ad 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -992,7 +992,7 @@ class FunctionStringOrArrayToT : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const ColumnPtr column = block.getByPosition(arguments[0]).column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnVector::create(); @@ -1002,7 +1002,7 @@ class FunctionStringOrArrayToT : public IFunction block.getByPosition(result).column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column.get())) + else if (const auto * col = checkAndGetColumn(column.get())) { if (Impl::is_fixed_to_constant) { @@ -1022,7 +1022,7 @@ class FunctionStringOrArrayToT : public IFunction block.getByPosition(result).column = std::move(col_res); } } - else if (const ColumnArray * col = checkAndGetColumn(column.get())) + else if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnVector::create(); @@ -1081,13 +1081,13 @@ class FunctionReverse : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const ColumnPtr column = block.getByPosition(arguments[0]).column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnString::create(); ReverseImpl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets()); block.getByPosition(result).column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column.get())) + else if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnFixedString::create(col->getN()); ReverseImpl::vectorFixed(col->getChars(), col->getN(), col_res->getChars()); @@ -1131,7 +1131,7 @@ class FunctionJsonLength : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const ColumnPtr column = block.getByPosition(arguments[0]).column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = 
ColumnUInt64::create(); typename ColumnUInt64::Container & vec_col_res = col_res->getData(); @@ -1232,8 +1232,8 @@ class ConcatImpl : public IFunction const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); const IColumn * c1 = block.getByPosition(arguments[1]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); - const ColumnString * c1_string = checkAndGetColumn(c1); + const auto * c0_string = checkAndGetColumn(c0); + const auto * c1_string = checkAndGetColumn(c1); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); const ColumnConst * c1_const_string = checkAndGetColumnConst(c1); @@ -1552,7 +1552,7 @@ class FunctionSubstring : public IFunction if (number_of_arguments == 3) column_length = block.getByPosition(arguments[2]).column; - const ColumnConst * column_start_const = checkAndGetColumn(column_start.get()); + const auto * column_start_const = checkAndGetColumn(column_start.get()); const ColumnConst * column_length_const = nullptr; if (number_of_arguments == 3) @@ -1572,9 +1572,9 @@ class FunctionSubstring : public IFunction throw Exception("Third argument provided for function substring could not be negative.", ErrorCodes::ARGUMENT_OUT_OF_BOUND); } - if (const ColumnString * col = checkAndGetColumn(column_string.get())) + if (const auto * col = checkAndGetColumn(column_string.get())) executeForSource(column_start, column_length, column_start_const, column_length_const, start_value, length_value, block, result, StringSource(*col)); - else if (const ColumnFixedString * col = checkAndGetColumn(column_string.get())) + else if (const auto * col = checkAndGetColumn(column_string.get())) executeForSource(column_start, column_length, column_start_const, column_length_const, start_value, length_value, block, result, FixedStringSource(*col)); else if (const ColumnConst * col = checkAndGetColumnConst(column_string.get())) executeForSource(column_start, column_length, column_start_const, column_length_const, start_value, length_value, block, result, ConstSource(*col)); @@ -1676,7 +1676,7 @@ class FunctionSubstringUTF8 : public IFunction return true; } - const ColumnString * col = checkAndGetColumn(column_string.get()); + const auto * col = checkAndGetColumn(column_string.get()); assert(col); auto col_res = ColumnString::create(); getVectorConstConstFunc(implicit_length, is_positive)(col->getChars(), col->getOffsets(), start_abs, length, col_res->getChars(), col_res->getOffsets()); @@ -1732,7 +1732,7 @@ class FunctionSubstringUTF8 : public IFunction // convert to vector if string is const. ColumnPtr full_column_string = column_string->isColumnConst() ? 
column_string->convertToFullColumnIfConst() : column_string; - const ColumnString * col = checkAndGetColumn(full_column_string.get()); + const auto * col = checkAndGetColumn(full_column_string.get()); assert(col); auto col_res = ColumnString::create(); if (implicit_length) @@ -1869,7 +1869,7 @@ class FunctionRightUTF8 : public IFunction using LengthFieldType = typename LengthType::FieldType; auto col_res = ColumnString::create(); - if (const ColumnString * col_string = checkAndGetColumn(column_string.get())) + if (const auto * col_string = checkAndGetColumn(column_string.get())) { if (column_length->isColumnConst()) { @@ -1897,7 +1897,7 @@ class FunctionRightUTF8 : public IFunction else if (const ColumnConst * col_const_string = checkAndGetColumnConst(column_string.get())) { // const vector - const ColumnString * col_string_from_const = checkAndGetColumn(col_const_string->getDataColumnPtr().get()); + const auto * col_string_from_const = checkAndGetColumn(col_const_string->getDataColumnPtr().get()); assert(col_string_from_const); // When useDefaultImplementationForConstants is true, string and length are not both constants assert(!column_length->isColumnConst()); @@ -1993,7 +1993,7 @@ class FunctionAppendTrailingCharIfAbsent : public IFunction if (!checkColumnConst(column_char.get())) throw Exception(fmt::format("Second argument of function {} must be a constant string", getName()), ErrorCodes::ILLEGAL_COLUMN); - String trailing_char_str = static_cast(*column_char).getValue(); + auto trailing_char_str = static_cast(*column_char).getValue(); if (trailing_char_str.size() != 1) throw Exception(fmt::format("Second argument of function {} must be a one-character string", getName()), ErrorCodes::BAD_ARGUMENTS); @@ -2101,7 +2101,7 @@ class TrimImpl : public IFunction void executeTrim(Block & block, const ColumnNumbers & arguments, const size_t result) const { const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); + const auto * c0_string = checkAndGetColumn(c0); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); auto c_res = ColumnString::create(); @@ -2121,8 +2121,8 @@ class TrimImpl : public IFunction const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); const IColumn * c1 = block.getByPosition(arguments[1]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); - const ColumnString * c1_string = checkAndGetColumn(c1); + const auto * c0_string = checkAndGetColumn(c0); + const auto * c1_string = checkAndGetColumn(c1); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); const ColumnConst * c1_const_string = checkAndGetColumnConst(c1); @@ -2202,7 +2202,7 @@ class TrimUTF8Impl : public IFunction void executeTrim(Block & block, const ColumnNumbers & arguments, const size_t result) const { const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); + const auto * c0_string = checkAndGetColumn(c0); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); auto c_res = ColumnString::create(); @@ -2225,7 +2225,7 @@ class TrimUTF8Impl : public IFunction const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); const IColumn * c1 = block.getByPosition(arguments[1]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); + const auto * c0_string = checkAndGetColumn(c0); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); const ColumnConst * c1_const_string = 
checkAndGetColumnConst(c1); const auto * column_trim_string = checkAndGetColumn(c1_const_string->getDataColumnPtr().get()); @@ -2716,7 +2716,7 @@ class FunctionTiDBTrim : public IFunction ColumnPtr & column_data = block.getByPosition(arguments[0]).column; auto res_col = ColumnString::create(); - const ColumnString * data_col = checkAndGetColumn(column_data.get()); + const auto * data_col = checkAndGetColumn(column_data.get()); static constexpr std::string_view default_rem = " "; static const auto * remstr_ptr = reinterpret_cast(default_rem.data()); @@ -2738,25 +2738,25 @@ class FunctionTiDBTrim : public IFunction if (data_const && !remstr_const) { const ColumnConst * data_col = checkAndGetColumnConst(column_data.get()); - const ColumnString * remstr_col = checkAndGetColumn(column_remstr.get()); + const auto * remstr_col = checkAndGetColumn(column_remstr.get()); - const std::string data = data_col->getValue(); + const auto data = data_col->getValue(); const auto * data_ptr = reinterpret_cast(data.c_str()); constVector(is_ltrim, is_rtrim, data_ptr, data.size() + 1, remstr_col->getChars(), remstr_col->getOffsets(), res_col->getChars(), res_col->getOffsets()); } else if (remstr_const && !data_const) { const ColumnConst * remstr_col = checkAndGetColumnConst(column_remstr.get()); - const ColumnString * data_col = checkAndGetColumn(column_data.get()); + const auto * data_col = checkAndGetColumn(column_data.get()); - const std::string remstr = remstr_col->getValue(); + const auto remstr = remstr_col->getValue(); const auto * remstr_ptr = reinterpret_cast(remstr.c_str()); vectorConst(is_ltrim, is_rtrim, data_col->getChars(), data_col->getOffsets(), remstr_ptr, remstr.size() + 1, res_col->getChars(), res_col->getOffsets()); } else { - const ColumnString * data_col = checkAndGetColumn(column_data.get()); - const ColumnString * remstr_col = checkAndGetColumn(column_remstr.get()); + const auto * data_col = checkAndGetColumn(column_data.get()); + const auto * remstr_col = checkAndGetColumn(column_remstr.get()); vectorVector(is_ltrim, is_rtrim, data_col->getChars(), data_col->getOffsets(), remstr_col->getChars(), remstr_col->getOffsets(), res_col->getChars(), res_col->getOffsets()); } @@ -2769,7 +2769,7 @@ class FunctionTiDBTrim : public IFunction ColumnPtr & column_direction = block.getByPosition(arguments[2]).column; if (!column_direction->isColumnConst()) throw Exception(fmt::format("3nd argument of function {} must be constant.", getName())); - const ColumnConst * direction_col = checkAndGetColumn(column_direction.get()); + const auto * direction_col = checkAndGetColumn(column_direction.get()); static constexpr Int64 trim_both_default = 0; // trims from both direction by default static constexpr Int64 trim_both = 1; // trims from both direction with explicit notation @@ -2989,7 +2989,7 @@ class TidbPadImpl { continue; } - int32_t len = static_cast(column_length->getInt(i)); + auto len = static_cast(column_length->getInt(i)); if (len <= 0) { len = 0; @@ -3051,7 +3051,7 @@ class TidbPadImpl } else { - const ColumnString * column_string = checkAndGetColumn(column_string_ptr.get()); + const auto * column_string = checkAndGetColumn(column_string_ptr.get()); const ColumnString::Offsets & string_offsets = column_string->getOffsets(); const ColumnString::Chars_t & string_data = column_string->getChars(); @@ -3233,7 +3233,7 @@ class TidbPadImpl return true; } - ColumnString::Offset tmp_target_len = static_cast(target_len); + auto tmp_target_len = static_cast(target_len); ColumnString::Offset per_pad_offset = 
0; ColumnString::Offset pad_bytes = 0; ColumnString::Offset left = 0; @@ -3300,7 +3300,7 @@ class TidbPadImpl return true; } - ColumnString::Offset tmp_target_len = static_cast(target_len); + auto tmp_target_len = static_cast(target_len); if (data_len < tmp_target_len) { ColumnString::Offset left = tmp_target_len - data_len; @@ -3421,7 +3421,7 @@ class PadImpl : public IFunction ColumnPtr column_length = block.getByPosition(arguments[1]).column; ColumnPtr column_padding = block.getByPosition(arguments[2]).column; - const ColumnConst * column_length_const = checkAndGetColumn(column_length.get()); + const auto * column_length_const = checkAndGetColumn(column_length.get()); const ColumnConst * column_padding_const = checkAndGetColumnConst(column_padding.get()); Int64 length_value = 0; @@ -3441,7 +3441,7 @@ class PadImpl : public IFunction auto c_res = ColumnString::create(); - if (const ColumnString * col = checkAndGetColumn(column_string.get())) + if (const auto * col = checkAndGetColumn(column_string.get())) pad, StringSink>( StringSource(*col), ConstSource(*column_padding_const), @@ -3548,7 +3548,7 @@ class PadUTF8Impl : public IFunction ColumnPtr column_length = block.getByPosition(arguments[1]).column; ColumnPtr column_padding = block.getByPosition(arguments[2]).column; - const ColumnConst * column_length_const = checkAndGetColumn(column_length.get()); + const auto * column_length_const = checkAndGetColumn(column_length.get()); const ColumnConst * column_padding_const = checkAndGetColumnConst(column_padding.get()); Int64 length_value = 0; @@ -3568,7 +3568,7 @@ class PadUTF8Impl : public IFunction auto c_res = ColumnString::create(); const auto * column_padding_string = checkAndGetColumn(column_padding_const->getDataColumnPtr().get()); - if (const ColumnString * col = checkAndGetColumn(column_string.get())) + if (const auto * col = checkAndGetColumn(column_string.get())) vector(col->getChars(), col->getOffsets(), length_value, column_padding_string->getChars(), column_padding_string->getOffsets(), c_res->getChars(), c_res->getOffsets()); else if (const ColumnConst * col = checkAndGetColumnConst(column_string.get())) { @@ -4114,8 +4114,8 @@ class FunctionASCII : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const ColumnConst * c0_const = checkAndGetColumn(c0_col); - const ColumnString * c0_string = checkAndGetColumn(c0_col); + const auto * c0_const = checkAndGetColumn(c0_col); + const auto * c0_string = checkAndGetColumn(c0_col); Field res_field; int val_num = c0_col->size(); @@ -4165,8 +4165,8 @@ class FunctionLength : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const ColumnConst * c0_const = checkAndGetColumn(c0_col); - const ColumnString * c0_string = checkAndGetColumn(c0_col); + const auto * c0_const = checkAndGetColumn(c0_col); + const auto * c0_string = checkAndGetColumn(c0_col); Field res_field; int val_num = c0_col->size(); @@ -4215,13 +4215,13 @@ class FunctionPosition : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const ColumnConst * c0_const = checkAndGetColumn(c0_col); - const ColumnString * c0_string = checkAndGetColumn(c0_col); + const auto * c0_const 
= checkAndGetColumn(c0_col); + const auto * c0_string = checkAndGetColumn(c0_col); Field c0_field; const IColumn * c1_col = block.getByPosition(arguments[1]).column.get(); - const ColumnConst * c1_const = checkAndGetColumn(c1_col); - const ColumnString * c1_string = checkAndGetColumn(c1_col); + const auto * c1_const = checkAndGetColumn(c1_col); + const auto * c1_string = checkAndGetColumn(c1_col); Field c1_field; if ((c0_const == nullptr && c0_string == nullptr) || (c1_const == nullptr && c1_string == nullptr)) @@ -4331,7 +4331,7 @@ class FunctionSubStringIndex : public IFunction column_str = column_str->isColumnConst() ? column_str->convertToFullColumnIfConst() : column_str; if (delim_const && count_const) { - const ColumnString * str_col = checkAndGetColumn(column_str.get()); + const auto * str_col = checkAndGetColumn(column_str.get()); const ColumnConst * delim_col = checkAndGetColumnConst(column_delim.get()); const ColumnConst * count_col = checkAndGetColumnConst>(column_count.get()); if (str_col == nullptr || delim_col == nullptr || count_col == nullptr) @@ -4339,7 +4339,7 @@ class FunctionSubStringIndex : public IFunction return false; } auto col_res = ColumnString::create(); - IntType count = count_col->getValue(); + auto count = count_col->getValue(); vectorConstConst( str_col->getChars(), str_col->getOffsets(), @@ -4353,9 +4353,9 @@ class FunctionSubStringIndex : public IFunction { column_delim = column_delim->isColumnConst() ? column_delim->convertToFullColumnIfConst() : column_delim; column_count = column_count->isColumnConst() ? column_count->convertToFullColumnIfConst() : column_count; - const ColumnString * str_col = checkAndGetColumn(column_str.get()); - const ColumnString * delim_col = checkAndGetColumn(column_delim.get()); - const ColumnVector * count_col = checkAndGetColumn>(column_count.get()); + const auto * str_col = checkAndGetColumn(column_str.get()); + const auto * delim_col = checkAndGetColumn(column_delim.get()); + const auto * count_col = checkAndGetColumn>(column_count.get()); if (str_col == nullptr || delim_col == nullptr || count_col == nullptr) { return false; @@ -4573,7 +4573,9 @@ class FormatImpl : public IFunction using NumberFieldType = typename NumberType::FieldType; using NumberColVec = std::conditional_t, ColumnDecimal, ColumnVector>; const auto * number_raw = block.getByPosition(arguments[0]).column.get(); + TiDBDecimalRoundInfo info{number_type, number_type}; + info.output_prec = info.output_prec < 65 ? 
info.output_prec + 1 : 65;
 
     return getPrecisionType(precision_base_type, [&](const auto & precision_type, bool) {
         using PrecisionType = std::decay_t<decltype(precision_type)>;
@@ -4723,10 +4725,11 @@ class FormatImpl : public IFunction
     static void format(
         T number,
         size_t max_num_decimals,
-        const TiDBDecimalRoundInfo & info,
+        TiDBDecimalRoundInfo & info,
         ColumnString::Chars_t & res_data,
         ColumnString::Offsets & res_offsets)
     {
+        info.output_scale = std::min(max_num_decimals, static_cast<size_t>(info.input_scale));
         auto round_number = round(number, max_num_decimals, info);
         std::string round_number_str = number2Str(round_number, info);
         std::string buffer = Format::apply(round_number_str, max_num_decimals);
@@ -4870,7 +4873,7 @@ class FunctionFormatWithLocale : public IFunction
         }
         else
         {
-            const String value = locale_const->getValue<String>();
+            const auto value = locale_const->getValue<String>();
             if (!boost::iequals(value, supported_locale))
             {
                 const auto & msg = genWarningMsg(value);
diff --git a/dbms/src/Functions/Regexps.h b/dbms/src/Functions/Regexps.h
index 119169be8b5..3eddd383cfb 100644
--- a/dbms/src/Functions/Regexps.h
+++ b/dbms/src/Functions/Regexps.h
@@ -18,13 +18,6 @@
 #include
 #include
 
-
-namespace ProfileEvents
-{
-extern const Event RegexpCreated;
-}
-
-
 namespace DB
 {
 namespace Regexps
@@ -54,7 +47,6 @@ inline Pool::Pointer get(const std::string & pattern, int flags)
         if (no_capture)
             flags |= OptimizedRegularExpression::RE_NO_CAPTURE;
 
-        ProfileEvents::increment(ProfileEvents::RegexpCreated);
         return new Regexp{createRegexp(pattern, flags)};
     });
 }
diff --git a/dbms/src/Functions/bitShiftRight.cpp b/dbms/src/Functions/bitShiftRight.cpp
index 961f7459f68..90b365771de 100644
--- a/dbms/src/Functions/bitShiftRight.cpp
+++ b/dbms/src/Functions/bitShiftRight.cpp
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include
+#include
+
+#include
 
 namespace DB
 {
@@ -29,7 +32,18 @@ struct BitShiftRightImpl
     template <typename Result = ResultType>
     static Result apply(A a, B b)
     {
-        return static_cast<Result>(a) >> static_cast<Result>(b);
+        // In C++ a shift is undefined behavior when the right operand is negative, or greater than
+        // or equal to the number of bits in the (promoted) left operand.
+        // See https://en.cppreference.com/w/cpp/language/operator_arithmetic for details.
+        // For example, UInt64 >> 64 is UB in C++, while TiDB defines the result to be 0.
+        if (static_cast<UInt64>(b) >= std::numeric_limits<decltype(static_cast<Result>(a))>::digits)
+        {
+            return static_cast<Result>(0);
+        }
+        // Note that we do not consider the case where the right operand is negative,
+        // since all other types are cast to UInt64 before the shift operation
+        // according to DAGExpressionAnalyzerHelper::buildBitwiseFunction.
+        // Therefore, we simply suppress the clang-tidy check here.
+        return static_cast<Result>(a) >> static_cast<Result>(b); // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult)
     }
     template <typename Result = ResultType>
     static Result apply(A, B, UInt8 &)
@@ -87,4 +101,4 @@ void registerFunctionBitShiftRight(FunctionFactory & factory)
 {
     factory.registerFunction<FunctionBitShiftRight>();
 }
-} // namespace DB
\ No newline at end of file
+} // namespace DB
diff --git a/dbms/src/Functions/tests/gtest_bitshiftright.cpp b/dbms/src/Functions/tests/gtest_bitshiftright.cpp
new file mode 100644
index 00000000000..a4af6336099
--- /dev/null
+++ b/dbms/src/Functions/tests/gtest_bitshiftright.cpp
@@ -0,0 +1,273 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ +class TestFunctionBitShiftRight : public DB::tests::FunctionTest +{ +}; + +#define ASSERT_BITSHIFTRIGHT(t1, t2, result) \ + ASSERT_COLUMN_EQ(result, executeFunction("bitShiftRight", {t1, t2})) + +TEST_F(TestFunctionBitShiftRight, Simple) +try +{ + ASSERT_BITSHIFTRIGHT(createColumn>({8}), + createColumn>({2}), + createColumn>({2})); +} +CATCH + +/// Note: Only IntX and UIntX will be received by BitShiftRight, others will be casted by TiDB planner. +/// Note: BitShiftRight will further cast other types to UInt64 before doing shift. +TEST_F(TestFunctionBitShiftRight, TypePromotion) +try +{ + // Type Promotion + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({1}), createColumn>({9223372036854775807ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({1}), createColumn>({9223372036854775807ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({1}), createColumn>({9223372036854775807ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({1}), createColumn>({9223372036854775807ull})); + + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + + // Type Promotion across signed/unsigned + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({0}), createColumn>({18446744073709551615ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({0}), createColumn>({18446744073709551615ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); +} +CATCH + +TEST_F(TestFunctionBitShiftRight, Nullable) +try +{ + // Non Nullable + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + + // Across Nullable and non-Nullable + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); 
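+    // A worked example of the promotion rule verified in TypePromotion above: Int8(-1) is first
+    // reinterpreted as UInt64 (0xFFFFFFFFFFFFFFFF), so shifting right by 1 yields
+    // 0x7FFFFFFFFFFFFFFF, i.e. 9223372036854775807.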
+ ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); +} +CATCH + +TEST_F(TestFunctionBitShiftRight, TypeCastWithConst) +try +{ + /// need test these kinds of columns: + /// 1. ColumnVector + /// 2. ColumnVector + /// 3. ColumnConst + /// 4. ColumnConst, value != null + /// 5. 
ColumnConst, value = null + + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createColumn({0, 1, 0, 1}), createColumn({0, 0, 1, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({0, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createConstColumn(4, 0), createColumn({0, 0, 1, 1})); + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createConstColumn>(4, 0), createColumn({0, 0, 1, 1})); + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); // become const in wrapInNullable + + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn({0, 1, 0, 1}), createColumn>({0, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({0, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createConstColumn(4, 0), createColumn>({0, 1, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createConstColumn(4, 0), createColumn>({0, 1, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); + + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createColumn({0, 1, 0, 1}), createColumn({1, 0, 1, 0})); + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({1, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createConstColumn(4, 0), createConstColumn(4, 1)); + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createConstColumn>(4, 0), createConstColumn(4, 1)); + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); + + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createColumn({0, 1, 0, 1}), createColumn({1, 0, 1, 0})); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({1, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createConstColumn(4, 0), createConstColumn(4, 1)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createConstColumn>(4, 0), createConstColumn(4, 1)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); + + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createColumn({0, 1, 0, 1}), createConstColumn>(4, std::nullopt)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createColumn>({0, 1, std::nullopt, std::nullopt}), createConstColumn>(4, std::nullopt)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createConstColumn(4, 0), createConstColumn>(4, std::nullopt)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createConstColumn(4, 0), createConstColumn>(4, std::nullopt)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); +} +CATCH + +TEST_F(TestFunctionBitShiftRight, Boundary) +try +{ + ASSERT_BITSHIFTRIGHT(createColumn({127, 127, -128, -128}), createColumn({0, 7, 0, 7}), createColumn({127, 0, 18446744073709551488ull, 144115188075855871ull})); + ASSERT_BITSHIFTRIGHT(createColumn({127, 127, -128, 
-128}), createColumn({0, 7, 0, 7}), createColumn({127, 0, 18446744073709551488ull, 144115188075855871ull})); + ASSERT_BITSHIFTRIGHT(createColumn({32767, 32767, -32768, -32768}), createColumn({0, 15, 0, 15}), createColumn({32767, 0, 18446744073709518848ull, 562949953421311ull})); + + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1, -1, -1, INT64_MAX, INT64_MAX, INT64_MIN, INT64_MIN}), + createColumn({0, 63, 0, 63, 0, 63, 0, 63, 0, 63}), + createColumn({0, 0, 1, 0, 18446744073709551615ull, 1, INT64_MAX, 0, 9223372036854775808ull, 1})); +} +CATCH + +TEST_F(TestFunctionBitShiftRight, UINT64) +try +{ + ASSERT_BITSHIFTRIGHT(createColumn({0, UINT64_MAX}), + createColumn({63, 63}), + createColumn({0, 1})); + + ASSERT_BITSHIFTRIGHT(createColumn>({0, UINT64_MAX, std::nullopt}), + createColumn>({63, 63, 63}), + createColumn>({0, 1, std::nullopt})); + + ASSERT_BITSHIFTRIGHT(createColumn>({0, UINT64_MAX, std::nullopt}), + createColumn({63, 63, 63}), + createColumn>({0, 1, std::nullopt})); + + ASSERT_BITSHIFTRIGHT(createColumn({0, UINT64_MAX}), + createColumn>({63, 63}), + createColumn>({0, 1})); + + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1, -1, -1, INT64_MAX, INT64_MAX, INT64_MIN, INT64_MIN}), + createColumn({0, UINT64_MAX, 0, UINT64_MAX, 0, UINT64_MAX, 0, UINT64_MAX, 0, UINT64_MAX}), + createColumn({0, 0, 1, 0, 18446744073709551615ull, 0, INT64_MAX, 0, 9223372036854775808ull, 0})); + + + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, UINT64_MAX, UINT64_MAX}), + createColumn({0, UINT64_MAX, 0, UINT64_MAX}), + createColumn({0, 0, UINT64_MAX, 0})); + + ASSERT_BITSHIFTRIGHT(createColumn>({0, 0, UINT64_MAX, UINT64_MAX, 0, std::nullopt}), + createColumn>({0, UINT64_MAX, 0, UINT64_MAX, std::nullopt, 0}), + createColumn>({0, 0, UINT64_MAX, 0, std::nullopt, std::nullopt})); + + ASSERT_BITSHIFTRIGHT(createColumn>({0, 0, UINT64_MAX, UINT64_MAX, std::nullopt}), + createColumn({0, UINT64_MAX, 0, UINT64_MAX, 0}), + createColumn>({0, 0, UINT64_MAX, 0, std::nullopt})); + + ASSERT_BITSHIFTRIGHT(createColumn({0, UINT64_MAX, 0, UINT64_MAX, 0}), + createColumn>({0, 0, UINT64_MAX, UINT64_MAX, std::nullopt}), + createColumn>({0, UINT64_MAX, 0, 0, std::nullopt})); + + /* + std::mt19937 gen(std::random_device{}()); + std::uniform_int_distribution dis( + std::numeric_limits::min(), + std::numeric_limits::max() + ); + size_t count = 100; + std::vector v1(count), v2(count), res(count); + for (size_t i=0; i> v2[i]; + } + */ + // clang-format off + 
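+    // The three columns below hold 100 random (value, shift) pairs produced by the generator
+    // sketched in the comment above; the expected results follow the same UInt64 semantics as
+    // the implementation.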
ASSERT_BITSHIFTRIGHT(createColumn({4286230172992429668ull,11550684080080434735ull,775195682263841867ull,18390588538388462661ull,15578761645824658314ull,20662948907547635ull,8403266546632871011ull,10316916867086714284ull,14494183568060929367ull,11741337603037632348ull,10803264694948981380ull,2181969932373516503ull,9673801579564730047ull,12998855911221966916ull,13852157931865274857ull,9203926828777338586ull,8903261359104369984ull,3296258311466476456ull,14658801806079697908ull,7542518003247963618ull,7751150277360944372ull,12225694156629117269ull,3173837214287201256ull,10555082060194839563ull,14202570947308501213ull,13841194359225980123ull,9085267378073816945ull,15975493157631073381ull,1890233386459299033ull,2368634323417847398ull,691423931511513606ull,986000479038857169ull,6676906740954304741ull,2841686799872009560ull,6483676442160212821ull,12550114481083571140ull,1973026146580965947ull,15006687639313690830ull,6443617813685195609ull,13648732879238232658ull,173820604016606515ull,2669428687588070677ull,15361476519767969236ull,8957522718906827285ull,10484385204137290737ull,12390466571993898199ull,13655746682011856065ull,4183302523705398003ull,9898692767945122925ull,16701902679050716746ull,15003324714492513897ull,15554724240808081962ull,7754458312088240871ull,16060968032680196798ull,12619581440986221928ull,15462661961676206824ull,2991773628650321635ull,16341599119345297909ull,14943939970889580769ull,17589764776976679210ull,15274914527536421890ull,16268454608136611433ull,14617646699124891378ull,466927094873143934ull,10558583305251737283ull,255559140356160501ull,5962789691899784330ull,8004603198837555992ull,1881892337023478820ull,6549167700870881840ull,17551996157828573642ull,3349744237253314638ull,2876698686583880568ull,16792783373922568330ull,16231348759981899800ull,17731631990557975899ull,1305376485657663531ull,3568754485566225727ull,10076204423028931225ull,1206238310176455071ull,4297062324543635867ull,5116785256928623516ull,4216305034157620433ull,412817651268481791ull,11256299741838589766ull,10786197076871163667ull,8588357635228913652ull,6361409982074778071ull,4750871994764527580ull,12851835128796581697ull,13871712051825681122ull,12445309465661589227ull,1668617678034382020ull,10152918068481134781ull,16242941973571224246ull,12988338226657152812ull,2352083670492692674ull,10735026236980245779ull,14986388012066843516ull,17651064432466444102ull}), + createColumn({0,58,55,24,5,35,34,54,43,45,17,36,51,54,19,55,55,8,37,49,15,11,36,0,5,41,46,54,2,59,11,25,43,29,31,8,59,2,11,19,56,35,57,13,2,35,6,54,17,0,49,5,15,3,60,44,16,6,57,44,58,54,26,23,58,23,26,29,56,40,45,2,21,9,57,40,4,46,17,15,62,21,5,54,22,47,10,24,53,61,43,52,23,10,61,43,26,31,38,2}), + 
createColumn({4286230172992429668ull,40,21,1096164497041ull,486836301432020572ull,601370,489134489,572,1647797,333708,82422368583289ull,31751841,4296,721,26420894492846ull,255,247,12876009029165923ull,106656820,13398,236546334147978ull,5969577224916561ull,46185410,10555082060194839563ull,443830342103390662ull,6294246,129109,886,472558346614824758ull,4,337609341558356ull,29385104150ull,759076,5293054133ull,3019197118ull,49023884691732699ull,3,3751671909828422707ull,3146297760588474ull,26032891996838ull,2,77690599,106,1093447597522806ull,2621096301034322684ull,360610038,213371041906435251ull,232,75521032470284ull,16701902679050716746ull,26651,486085132525252561ull,236647287356208ull,2007621004085024599ull,10,878950,45650842722325ull,255337486239770279ull,103,999862,52,903,217819909738ull,55662047251ull,36,30465023560ull,88852490364ull,14909735319ull,26,5956433,498857,837436059313328659ull,1371716826717ull,32798405027192516ull,112,16126825,81586030353603970ull,50715,76875338920813ull,36811471868177ull,0,2439873341049ull,131759532317425638ull,22,2683710990390ull,76640,8387068003153235ull,379169582252ull,527,5,1577031,2763,198914727930ull,9914959051251108ull,7,1476603,35048777915ull,4998886136ull,54520161,4412766108116611025ull})); + // clang-format on +} +CATCH + +TEST_F(TestFunctionBitShiftRight, UB) +try +{ + ASSERT_BITSHIFTRIGHT(createColumn({127, -128}), createColumn({64, 64}), createColumn({0, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({127, -128}), createColumn({64, 64}), createColumn({0, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({32767, -32768}), createColumn({64, 64}), createColumn({0, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({INT32_MAX, INT32_MIN}), createColumn({64, 64}), createColumn({0, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({INT64_MAX, INT64_MIN}), createColumn({64, 64}), createColumn({0, 0})); + + ASSERT_BITSHIFTRIGHT(createColumn({255}), createColumn({64}), createColumn({0})); + ASSERT_BITSHIFTRIGHT(createColumn({255}), createColumn({64}), createColumn({0})); + ASSERT_BITSHIFTRIGHT(createColumn({65535}), createColumn({64}), createColumn({0})); + ASSERT_BITSHIFTRIGHT(createColumn({UINT32_MAX}), createColumn({64}), createColumn({0})); + ASSERT_BITSHIFTRIGHT(createColumn({UINT64_MAX}), createColumn({64}), createColumn({0})); + + /* + std::mt19937 gen(std::random_device{}()); + std::uniform_int_distribution dis1( + std::numeric_limits::min(), + std::numeric_limits::max() + ); + std::uniform_int_distribution dis2( + 64, + std::numeric_limits::max() + ); + size_t count = 100; + std::vector v1(count), v2(count), res(count); + for (size_t i=0; 
i({17563387625296433369ull,5842891814427459261ull,15074502074821508463ull,386435802999553003ull,5487893274931198395ull,8125923807366590570ull,13340330062727071249ull,14908193031091561411ull,296805448857369387ull,8684453485792353774ull,13117933444495098288ull,3225762988982100714ull,11290506757949810556ull,14617912756126856962ull,9479575714707174581ull,11720728318194739598ull,14410575429605211363ull,12068356718035872518ull,80682389916710599ull,11003236134534292734ull,4412447398096224810ull,5331184707993902906ull,13827083432789678788ull,958142831027309576ull,16716461997317184701ull,17128750834581527743ull,11590434571174666313ull,10204342520615148287ull,11067791415848657283ull,17583875436196878829ull,186304014359496415ull,9381729025189804702ull,11502205568225715300ull,16472133582690439104ull,3743303387826342067ull,12860029445868505658ull,2244056593742923769ull,3275687468466891223ull,1545828456957460699ull,14187252460708728077ull,7551907967738536187ull,9754400233340010491ull,16293183350230169116ull,6298812696728711031ull,5915538565572009956ull,2284684518775825662ull,1130711226902262476ull,17158957721471765323ull,4220824385439711070ull,16559772875254313109ull,15397179690017513678ull,6300413832999049491ull,13787530251307637715ull,10132349060092695582ull,10446586881482901699ull,15759779838283537085ull,14402587207027333363ull,5546051719872960161ull,6545031029710296628ull,17407295406267098658ull,4259019625544816073ull,791895457880289787ull,8549227257401578066ull,15246278171168501125ull,1674668228908076954ull,849762797502000057ull,13302651500925764574ull,12438174880334092333ull,17701249772557033303ull,10742459186038873636ull,15671491258945407856ull,9352557101631889001ull,8914093883925002585ull,17935292744735591949ull,606989231583658922ull,6528503454270721815ull,14980539549624989095ull,13765196438235456668ull,3058323869228644592ull,14346577759191739044ull,1543206286382906519ull,1025562312317433790ull,17052896445025268012ull,18349597294988935754ull,17174604730104962524ull,11924965352621110201ull,502032511104181724ull,13845633389643139332ull,15436039204445155412ull,17809579006694175565ull,15166364145138562881ull,14062748599121933798ull,1777457178576774356ull,4985224560472716170ull,3881603168175384251ull,11555031280550342082ull,1252677486917153396ull,8744807353133366467ull,2048964426549800495ull,11945831330508218140ull}), + 
createColumn({7570379165150948640ull,2086259313016069849ull,3606689596671293211ull,14039117280692395662ull,13678665403528829741ull,16069000531561010558ull,18229345530821449414ull,433464578739092378ull,6298872104011095934ull,4518228872693063137ull,14988726875963869472ull,9568218424260764817ull,5383191468426384555ull,8698762658876708752ull,9487599666567205013ull,14370091126330876161ull,10702068376663045773ull,8045701071228357739ull,10878469353312437370ull,3183167829827610494ull,5928881618833110378ull,10410530709181481816ull,249988564503361262ull,13482614555530280987ull,5522946068620734806ull,12797173590813112894ull,14133419908717831141ull,10825732602137508628ull,13271177233899692778ull,1157753039017783757ull,3370600557036147696ull,2957689395775524062ull,11963898745206689513ull,4828931188614542720ull,15157289330857160797ull,369467010700905309ull,6278071805692607460ull,17817858137511910604ull,17789013631125929528ull,2861684947245777353ull,2583152408663154190ull,7935135702156687355ull,3033127046167579202ull,14224256960933395097ull,10838403249753694181ull,2154089102842257532ull,7860358918492191001ull,2982010253383852617ull,16385171982396620123ull,12241857497176342828ull,2080931105225959532ull,1046322072991155713ull,6146917059052005252ull,17411786298437646544ull,5497869583209795613ull,11701448129764809247ull,12642962700918363620ull,15936842187305218463ull,7811510447588439153ull,3558405966224377785ull,977960926168429540ull,9505800334935014018ull,12114068456102275321ull,5141880021314950000ull,6719615890604904521ull,1341445859098821585ull,3883912906202435997ull,2107770591867486616ull,2657186337437393032ull,2640917573672927653ull,3746140861437224253ull,15057648507099656234ull,12051189681068107042ull,2259769676757597701ull,2935229535510718769ull,6368233316971463582ull,14384644474340782197ull,2553547617837260603ull,14238122466576902747ull,9555765226032904481ull,15522640015319979866ull,10274396157562093026ull,5996101113505388770ull,16915812546351047056ull,4956089714130804219ull,17126605744801075545ull,12036643325202409080ull,11257234688654558199ull,375338337104024778ull,11152980243617851986ull,12325805905403174063ull,8653948654121626815ull,15348912598299408338ull,6883296938248095081ull,6484642948886870833ull,16936141613107270500ull,17012171815528507292ull,2574129622316042070ull,17178726110735453748ull,16578303277501346489ull}), + createColumn({0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0})); + // clang-format on +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Functions/tests/gtest_duration_pushdown.cpp b/dbms/src/Functions/tests/gtest_duration_pushdown.cpp index 4501a4c9fae..106f3d84642 100644 --- a/dbms/src/Functions/tests/gtest_duration_pushdown.cpp +++ b/dbms/src/Functions/tests/gtest_duration_pushdown.cpp @@ -166,5 +166,85 @@ try ASSERT_COLUMN_EQ(microSecond_out, executeFunction("microSecond", input4)); } CATCH + +TEST_F(DurationPushDown, timeToSecPushDownTest) +try +{ + ColumnWithTypeAndName input( + createColumn>({(838 * 3600 + 59 * 60 + 59) * 1000000000L + 999999000L, + -(838 * 3600 + 59 * 60 + 59) * 1000000000L - 123456000L, + 0, + (1 * 3600 + 2 * 60 + 3) * 1000000000L + 4000L}) + .column, + makeNullable(std::make_shared(6)), + "input"); + auto second_output = createColumn>({3020399, -3020399, 0, 3723}); + ASSERT_COLUMN_EQ(second_output, executeFunction("tidbTimeToSec", input)); + + // Test Overflow + 
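+    // MyDuration is bounded by +-838:59:59.999999, i.e.
+    // +-((838 * 3600 + 59 * 60 + 59) * 1000000000 + 999999000) = +-3020399999999000 nanoseconds;
+    // the inputs below sit one microsecond past that bound and must throw.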
ColumnWithTypeAndName input2( + createColumn>({(838 * 3600 + 59 * 60 + 59) * 1000000000L + 999999000L + 1000L}).column, + makeNullable(std::make_shared(6)), + "result"); + try + { + auto result = executeFunction("tidbTimeToSec", input2); + FAIL() << "Expected overflow"; + } + catch (DB::Exception & e) + { + ASSERT_EQ(e.message(), std::string("nanos must >= -3020399999999000 and <= 3020399999999000")); + } + catch (...) + { + FAIL() << "Expected overflow"; + }; + + ColumnWithTypeAndName input3( + createColumn>({-(838 * 3600 + 59 * 60 + 59) * 1000000000L - 999999000L - 1000L}).column, + makeNullable(std::make_shared(6)), + "result"); + try + { + auto result = executeFunction("tidbTimeToSec", input3); + FAIL() << "Expected overflow"; + } + catch (DB::Exception & e) + { + ASSERT_EQ(e.message(), std::string("nanos must >= -3020399999999000 and <= 3020399999999000")); + } + catch (...) + { + FAIL() << "Expected overflow"; + }; + + // Random Test + constexpr int rowNum = 1000; + auto dur_column = ColumnVector::create(); + auto & dur_data = dur_column->getData(); + auto second_column = ColumnVector::create(); + auto & second_data = second_column->getData(); + dur_data.resize(rowNum); + second_data.resize(rowNum); + + std::random_device rd; + std::default_random_engine gen = std::default_random_engine(rd()); + std::uniform_int_distribution sign_dis(0, 1), hour_dis(0, 838), minute_dis(0, 59), second_dis(0, 59), microSecond_dis(0, 999999); + for (int i = 0; i < rowNum; ++i) + { + auto sign = (sign_dis(gen) == 0) ? 1 : -1; + auto hour = hour_dis(gen); + auto minute = minute_dis(gen); + auto second = second_dis(gen); + auto microSecond = microSecond_dis(gen); + dur_data[i] = sign * ((hour * 3600 + minute * 60 + second) * 1000000000L + microSecond * 1000L); + second_data[i] = sign * (hour * 3600 + minute * 60 + second); + } + + ColumnWithTypeAndName input4(std::move(dur_column), std::make_shared(6), "duration"); + ColumnWithTypeAndName second_out(std::move(second_column), std::make_shared(), "time_to_sec"); + ASSERT_COLUMN_EQ(second_out, executeFunction("tidbTimeToSec", input4)); +} +CATCH } // namespace tests } // namespace DB \ No newline at end of file diff --git a/dbms/src/Functions/tests/gtest_get_format.cpp b/dbms/src/Functions/tests/gtest_get_format.cpp new file mode 100644 index 00000000000..61a8d80e7b4 --- /dev/null +++ b/dbms/src/Functions/tests/gtest_get_format.cpp @@ -0,0 +1,153 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
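+
+// Tests for the getFormat function added in FunctionsConversion.h. It mirrors MySQL's GET_FORMAT():
+// a constant time type ("DATE", "DATETIME", "TIMESTAMP" or "TIME") combined with a location string
+// is mapped to a format template, and unknown inputs yield an empty string.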
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#include + +#pragma GCC diagnostic pop + +namespace DB::tests +{ +class GetFormatTest : public DB::tests::FunctionTest +{ +public: + static constexpr auto funcName = "getFormat"; +}; + +TEST_F(GetFormatTest, testBoundary) +try +{ + // const(non-null), vector + // time_type is a const with non null value + // location is a vector containing null + ASSERT_COLUMN_EQ( + createColumn>({"%m.%d.%Y", {}}), + executeFunction( + funcName, + createConstColumn>(2, "DATE"), + createColumn>({"USA", {}}))); + + // const(null), vector + // time_type is a const with null value + // location is a vector containing null + ASSERT_COLUMN_EQ( + createConstColumn>(2, {}), + executeFunction( + funcName, + createConstColumn>(2, {}), + createColumn>({"USA", {}}))); + + // const(non-null), const(non-null) + // time_type is a const with non null value + // location is a const with non null value + ASSERT_COLUMN_EQ( + createConstColumn(2, "%m.%d.%Y"), + executeFunction( + funcName, + createConstColumn>(2, "DATE"), + createConstColumn>(2, "USA"))); + + // const(non-null), const(null) + // time_type is a const with non null value + // location is a const with null value + ASSERT_COLUMN_EQ( + createConstColumn>(2, {}), + executeFunction( + funcName, + createConstColumn>(2, "DATE"), + createConstColumn>(2, {}))); + + // The time_type is a system pre_defined macro, thus assume time_type column is const + // Throw an exception is time_type is not ColumnConst + ASSERT_THROW( + executeFunction( + funcName, + createColumn>({"DATE", "TIME"}), + createColumn>({"USA", {}})), + DB::Exception); +} +CATCH + +TEST_F(GetFormatTest, testMoreCases) +try +{ + // time_type: DATE + // all locations + ASSERT_COLUMN_EQ( + createColumn>({"%m.%d.%Y", "%Y-%m-%d", "%Y-%m-%d", "%d.%m.%Y", "%Y%m%d"}), + executeFunction( + funcName, + createConstColumn>(5, "DATE"), + createColumn>({"USA", "JIS", "ISO", "EUR", "INTERNAL"}))); + + // time_type: DATETIME + // all locations + ASSERT_COLUMN_EQ( + createColumn>({"%Y-%m-%d %H.%i.%s", "%Y-%m-%d %H:%i:%s", "%Y-%m-%d %H:%i:%s", "%Y-%m-%d %H.%i.%s", "%Y%m%d%H%i%s"}), + executeFunction( + funcName, + createConstColumn>(5, "DATETIME"), + createColumn>({"USA", "JIS", "ISO", "EUR", "INTERNAL"}))); + + // time_type: TIMESTAMP + // all locations + ASSERT_COLUMN_EQ( + createColumn>({"%Y-%m-%d %H.%i.%s", "%Y-%m-%d %H:%i:%s", "%Y-%m-%d %H:%i:%s", "%Y-%m-%d %H.%i.%s", "%Y%m%d%H%i%s"}), + executeFunction( + funcName, + createConstColumn>(5, "TIMESTAMP"), + createColumn>({"USA", "JIS", "ISO", "EUR", "INTERNAL"}))); + + // time_type: TIME + // all locations + ASSERT_COLUMN_EQ( + createColumn>({"%h:%i:%s %p", "%H:%i:%s", "%H:%i:%s", "%H.%i.%s", "%H%i%s"}), + executeFunction( + funcName, + createConstColumn>(5, "TIME"), + createColumn>({"USA", "JIS", "ISO", "EUR", "INTERNAL"}))); + + // the location is not in ("USA", "JIS", "ISO", "EUR", "INTERNAL") + ASSERT_COLUMN_EQ( + createColumn>({"", ""}), + executeFunction( + funcName, + createConstColumn>(2, "TIME"), + createColumn>({"CAN", ""}))); + + // the time_type is not in ("DATE", "DATETIME", "TIMESTAMP", "TIME") + ASSERT_COLUMN_EQ( + createColumn>({"", ""}), + executeFunction( + funcName, + createConstColumn>(2, "TIMEINUTC"), + createColumn>({"USA", "ISO"}))); +} +CATCH + +} // namespace DB::tests diff --git a/dbms/src/Functions/tests/gtest_strings_format.cpp 
b/dbms/src/Functions/tests/gtest_strings_format.cpp index 2d571a9bb1b..8f3b899316e 100644 --- a/dbms/src/Functions/tests/gtest_strings_format.cpp +++ b/dbms/src/Functions/tests/gtest_strings_format.cpp @@ -34,7 +34,7 @@ class StringFormat : public DB::tests::FunctionTest using FieldType = DecimalField; using NullableDecimal = Nullable; ASSERT_COLUMN_EQ( - createColumn>({"0.0000", "-0.0120", "0.0120", "12,332.1000", "12,332", "12,332", "12,332.300000000000000000000000000000", "-12,332.30000", "-1,000.0", "-333.33", {}}), + createColumn>({"0.0000", "-0.0120", "0.0120", "12,332.1000", "12,332", "12,332", "12,332.300000000000000000000000000000", "-12,332.30000", "-1,000.0", "-333.33", {}, "99,999.9999000000", "100,000.000", "100,000"}), executeFunction( func_name, createColumn( @@ -49,8 +49,11 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-123323000), 4), FieldType(static_cast(-9999999), 4), FieldType(static_cast(-3333330), 4), - FieldType(static_cast(0), 0)}), - createColumn>({4, 4, 4, 4, 0, -1, 31, 5, 1, 2, {}}))); + FieldType(static_cast(0), 0), + FieldType(static_cast(999999999), 4), + FieldType(static_cast(999999999), 4), + FieldType(static_cast(999999999), 4)}), + createColumn>({4, 4, 4, 4, 0, -1, 31, 5, 1, 2, {}, 10, 3, -5}))); ASSERT_COLUMN_EQ( createColumn>({"12,332.100", "-12,332.300", "-1,000.000", "-333.333"}), executeFunction( @@ -62,8 +65,6 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-9999999), 4), FieldType(static_cast(-3333330), 4)}), createConstColumn>(4, 3))); - /// known issue https://github.com/pingcap/tiflash/issues/4891 - /* ASSERT_COLUMN_EQ( createColumn>({"-999.9999", "-1,000", "-1,000", "-999.999900000000000000000000000000", "-999.99990", "-1,000.0", "-1,000.00"}), executeFunction( @@ -74,7 +75,7 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-9999999), 4)), createColumn>({4, 0, -1, 31, 5, 1, 2}))); ASSERT_COLUMN_EQ( - createConstColumn>(1, "-1,000.000"), + createConstColumn(1, "-1,000.000"), executeFunction( func_name, createConstColumn( @@ -82,7 +83,6 @@ class StringFormat : public DB::tests::FunctionTest 1, FieldType(static_cast(-9999999), 4)), createConstColumn>(1, 3))); - */ ASSERT_COLUMN_EQ( createColumn>({"12,332.1000", "12,332", "12,332.300000000000000000000000000000", "-12,332.30000", "-1,000.0", "-333.33", {}}), executeFunction( @@ -108,8 +108,6 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-9999999), 4), FieldType(static_cast(-3333330), 4)}), createConstColumn>(4, 3))); - /// known issue https://github.com/pingcap/tiflash/issues/4891 - /* ASSERT_COLUMN_EQ( createColumn>({"-999.9999", "-1,000", "-999.999900000000000000000000000000", "-999.99990", "-1,000.0", "-1,000.00"}), executeFunction( @@ -120,7 +118,7 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-9999999), 4)), createColumn>({4, 0, 31, 5, 1, 2}))); ASSERT_COLUMN_EQ( - createConstColumn>(1, "-1,000.000"), + createConstColumn(1, "-1,000.000"), executeFunction( func_name, createConstColumn( @@ -128,7 +126,6 @@ class StringFormat : public DB::tests::FunctionTest 1, FieldType(static_cast(-9999999), 4)), createConstColumn>(1, 3))); - */ } template diff --git a/dbms/src/Functions/tests/gtest_strings_reverse.cpp b/dbms/src/Functions/tests/gtest_strings_reverse.cpp new file mode 100644 index 00000000000..304a403db83 --- /dev/null +++ b/dbms/src/Functions/tests/gtest_strings_reverse.cpp @@ -0,0 +1,120 @@ +// Copyright 2022 PingCAP, Ltd. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include + +#pragma GCC diagnostic pop + +namespace DB::tests +{ +class StringReverse : public DB::tests::FunctionTest +{ +protected: + static ColumnWithTypeAndName toVec(const std::vector & v) + { + return createColumn(v); + } + + static ColumnWithTypeAndName toNullableVec(const std::vector> & v) + { + return createColumn>(v); + } + + static ColumnWithTypeAndName toConst(const String & s) + { + return createConstColumn(1, s); + } +}; +// test reverse +TEST_F(StringReverse, stringReverseTest) +try +{ + std::vector candidate_strings = {"one week's time test", "abcdef", "abcabc", "moc.pacgnip"}; + std::vector reversed_strings = {"tset emit s'keew eno", "fedcba", "cbacba", "pingcap.com"}; + + // test vector + ASSERT_COLUMN_EQ( + toVec(reversed_strings), + executeFunction( + "reverse", + toVec(candidate_strings))); + + // test nullable + ASSERT_COLUMN_EQ( + toNullableVec({"", " ", {}, "pacgnip"}), + executeFunction( + "reverse", + toNullableVec({"", " ", {}, "pingcap"}))); + + // test const + ASSERT_COLUMN_EQ( + toConst("pacgnip"), + executeFunction( + "reverse", + toConst("pingcap"))); + + // test null + ASSERT_COLUMN_EQ( + toConst({}), + executeFunction( + "reverse", + toConst({}))); +} +CATCH + +// test reverseUTF8 +TEST_F(StringReverse, stringReverseUTF8Test) +try +{ + std::vector candidate_strings = {"one week's time test", "abc测试def", "abcテストabc", "ѐёђѓєѕіїјљњћќѝўџ", "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^", "αβγδεζηθικλμνξοπρστυφχψωσ", "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}; + std::vector reversed_strings = {"tset emit s'keew eno", "fed试测cba", "cbaトステcba", "џўѝќћњљјїіѕєѓђёѐ", "^!џ@ў#ѝ¥ќ)ћ(њ&љ……ј%ї@$і#ѕ@є!ѓ/ђ*ё-ѐ+", "σωψχφυτσρποξνμλκιθηζεδγβα", "✕σ★ω♘ψχ♖φυ♥τσ℉ρπ✚οξ✓νμ♫λκ€ιθ✂ηζ☎εδ➨γβ▼α▲", "շմնբվցղզխլկյհգֆդսապօիւըտռեոքծժճչրջձփթ"}; + + // test vector + ASSERT_COLUMN_EQ( + toVec(reversed_strings), + executeFunction( + "reverseUTF8", + toVec(candidate_strings))); + + // test nullable + ASSERT_COLUMN_EQ( + toNullableVec({"", " ", {}, "pacgnip"}), + executeFunction( + "reverseUTF8", + toNullableVec({"", " ", {}, "pingcap"}))); + + // test const + ASSERT_COLUMN_EQ( + toConst("pacgnip"), + executeFunction( + "reverseUTF8", + toConst("pingcap"))); + + // test null + ASSERT_COLUMN_EQ( + toConst({}), + executeFunction( + "reverseUTF8", + toConst({}))); +} +CATCH + +} // namespace DB::tests \ No newline at end of file diff --git a/dbms/src/IO/BufferWithOwnMemory.h b/dbms/src/IO/BufferWithOwnMemory.h index 272f4fc5c01..babe2541b33 100644 --- a/dbms/src/IO/BufferWithOwnMemory.h +++ b/dbms/src/IO/BufferWithOwnMemory.h @@ -21,14 +21,6 @@ #include - -namespace ProfileEvents -{ -extern const Event IOBufferAllocs; -extern const Event IOBufferAllocBytes; -} // namespace ProfileEvents - - namespace DB { /** Replacement for std::vector to use in buffers. 
@@ -119,9 +111,6 @@ struct Memory return; } - ProfileEvents::increment(ProfileEvents::IOBufferAllocs); - ProfileEvents::increment(ProfileEvents::IOBufferAllocBytes, m_capacity); - size_t new_capacity = align(m_capacity, alignment); m_data = static_cast(Allocator::alloc(new_capacity, alignment)); m_capacity = new_capacity; diff --git a/dbms/src/IO/ChecksumBuffer.h b/dbms/src/IO/ChecksumBuffer.h index f6d60677a12..b095545ea6e 100644 --- a/dbms/src/IO/ChecksumBuffer.h +++ b/dbms/src/IO/ChecksumBuffer.h @@ -27,7 +27,6 @@ namespace ProfileEvents { // no need to update sync, since write buffers inherit that directly from `WriteBufferFromFileDescriptor` extern const Event WriteBufferFromFileDescriptorWrite; -extern const Event WriteBufferFromFileDescriptorWriteFailed; extern const Event WriteBufferFromFileDescriptorWriteBytes; extern const Event ReadBufferFromFileDescriptorRead; extern const Event ReadBufferFromFileDescriptorReadBytes; @@ -107,7 +106,6 @@ class FramedChecksumWriteBuffer : public WriteBufferFromFileDescriptor } if (unlikely(count == -1)) { - ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed); if (errno == EINTR) continue; else @@ -386,8 +384,6 @@ class FramedChecksumReadBuffer : public ReadBufferFromFileDescriptor off_t doSeek(off_t offset, int whence) override { - ProfileEvents::increment(ProfileEvents::Seek); - auto & frame = reinterpret_cast &>( *(this->working_buffer.begin() - sizeof(ChecksumFrame))); // align should not fail diff --git a/dbms/src/IO/CompressedReadBufferBase.cpp b/dbms/src/IO/CompressedReadBufferBase.cpp index dd54c1b47a8..58bf47a9298 100644 --- a/dbms/src/IO/CompressedReadBufferBase.cpp +++ b/dbms/src/IO/CompressedReadBufferBase.cpp @@ -28,14 +28,6 @@ #include - -namespace ProfileEvents -{ -extern const Event ReadCompressedBytes; -extern const Event CompressedReadBufferBlocks; -extern const Event CompressedReadBufferBytes; -} // namespace ProfileEvents - namespace DB { namespace ErrorCodes @@ -83,8 +75,6 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_ if (size_compressed > DBMS_MAX_COMPRESSED_SIZE) throw Exception("Too large size_compressed. Most likely corrupted data.", ErrorCodes::TOO_LARGE_SIZE_COMPRESSED); - ProfileEvents::increment(ProfileEvents::ReadCompressedBytes, size_compressed + sizeof(checksum)); - /// Is whole compressed block located in 'compressed_in' buffer? 
if (compressed_in->offset() >= COMPRESSED_BLOCK_HEADER_SIZE && compressed_in->position() + size_compressed - COMPRESSED_BLOCK_HEADER_SIZE <= compressed_in->buffer().end()) @@ -115,9 +105,6 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_ template void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum) { - ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks); - ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed); - UInt8 method = compressed_buffer[0]; /// See CompressedWriteBuffer.h if (method == static_cast(CompressionMethodByte::LZ4)) diff --git a/dbms/src/IO/ReadBufferFromFileDescriptor.cpp b/dbms/src/IO/ReadBufferFromFileDescriptor.cpp index 90cc6e3ca76..4b3d52f3741 100644 --- a/dbms/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/dbms/src/IO/ReadBufferFromFileDescriptor.cpp @@ -77,7 +77,7 @@ bool ReadBufferFromFileDescriptor::nextImpl() if (profile_callback) { - ProfileInfo info; + ProfileInfo info; // NOLINT info.bytes_requested = internal_buffer.size(); info.bytes_read = res; info.nanoseconds = watch->elapsed(); @@ -120,8 +120,6 @@ off_t ReadBufferFromFileDescriptor::doSeek(off_t offset, int whence) } else { - ProfileEvents::increment(ProfileEvents::Seek); - pos = working_buffer.end(); off_t res = doSeekInFile(new_pos, SEEK_SET); if (-1 == res) @@ -145,7 +143,7 @@ bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds) FD_SET(fd, &fds); timeval timeout = {time_t(timeout_microseconds / 1000000), suseconds_t(timeout_microseconds % 1000000)}; - int res = select(1, &fds, 0, 0, &timeout); + int res = select(1, &fds, nullptr, nullptr, &timeout); if (-1 == res) throwFromErrno("Cannot select", ErrorCodes::CANNOT_SELECT); diff --git a/dbms/src/IO/WriteBuffer.h b/dbms/src/IO/WriteBuffer.h index 361081d1176..0c0fa2cb545 100644 --- a/dbms/src/IO/WriteBuffer.h +++ b/dbms/src/IO/WriteBuffer.h @@ -96,6 +96,24 @@ class WriteBuffer : public BufferBase } } + template + __attribute__((always_inline)) void writeFixed(const T * __restrict from) + { + if (likely(working_buffer.end() - pos >= static_cast(sizeof(T)))) + { + tiflash_compiler_builtin_memcpy(pos, from, sizeof(T)); + pos += sizeof(T); + } + else + { + [&]() __attribute__((noinline)) + { + write(reinterpret_cast(from), sizeof(T)); + } + (); + } + } + inline void write(char x) { diff --git a/dbms/src/IO/WriteBufferFromFileDescriptor.cpp b/dbms/src/IO/WriteBufferFromFileDescriptor.cpp index c18337497b7..49b6d871870 100644 --- a/dbms/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/dbms/src/IO/WriteBufferFromFileDescriptor.cpp @@ -24,7 +24,6 @@ namespace ProfileEvents { extern const Event FileFSync; extern const Event WriteBufferFromFileDescriptorWrite; -extern const Event WriteBufferFromFileDescriptorWriteFailed; extern const Event WriteBufferFromFileDescriptorWriteBytes; } // namespace ProfileEvents @@ -57,7 +56,6 @@ void WriteBufferFromFileDescriptor::nextImpl() if ((-1 == res || 0 == res) && errno != EINTR) { - ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed); throwFromErrno("Cannot write to file " + getFileName(), ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); } diff --git a/dbms/src/IO/createReadBufferFromFileBase.cpp b/dbms/src/IO/createReadBufferFromFileBase.cpp index 24c9dfb204c..0d129d03a1a 100644 --- a/dbms/src/IO/createReadBufferFromFileBase.cpp +++ b/dbms/src/IO/createReadBufferFromFileBase.cpp @@ -20,13 +20,6 @@ #endif #include - -namespace ProfileEvents -{ 
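Stepping back to the `WriteBuffer::writeFixed` helper added above: it takes a compile-time-sized fast path when the buffer has room, and pushes the rare refill case through the generic `write` kept out of line via a noinline lambda. Below is a rough standalone model of the same fast/slow split, using `std::memcpy` where the patch uses the internal `tiflash_compiler_builtin_memcpy` macro (`MiniBuffer` is illustrative, not TiFlash code):

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>

// A rough model of the fast/slow split in the new WriteBuffer::writeFixed.
struct MiniBuffer
{
    char storage[16] = {};
    char * pos = storage;
    char * end = storage + sizeof(storage);

    // Stand-in for the generic write(): the real one may flush and refill the buffer.
    void writeSlow(const char * data, std::size_t size)
    {
        assert(static_cast<std::size_t>(end - pos) >= size && "toy slow path: no flushing here");
        std::memcpy(pos, data, size);
        pos += size;
    }

    template <typename T>
    void writeFixed(const T * from)
    {
        if (static_cast<std::size_t>(end - pos) >= sizeof(T))
        {
            // Fast path: sizeof(T) is a compile-time constant, so the copy inlines to a few moves.
            std::memcpy(pos, from, sizeof(T));
            pos += sizeof(T);
        }
        else
        {
            // Rare path, kept out of the hot code in the real implementation via a noinline lambda.
            writeSlow(reinterpret_cast<const char *>(from), sizeof(T));
        }
    }
};

int main()
{
    MiniBuffer buf;
    long long x = 42;
    buf.writeFixed(&x); // takes the fast path while 8 bytes still fit
    assert(buf.pos == buf.storage + sizeof(x));
}
```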
-extern const Event CreatedReadBufferOrdinary; -extern const Event CreatedReadBufferAIO; -} // namespace ProfileEvents - namespace DB { namespace ErrorCodes @@ -46,13 +39,11 @@ createReadBufferFromFileBase( { if ((aio_threshold == 0) || (estimated_size < aio_threshold)) { - ProfileEvents::increment(ProfileEvents::CreatedReadBufferOrdinary); return std::make_unique(filename_, buffer_size_, flags_, existing_memory_, alignment); } else { #if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(_MSC_VER) - ProfileEvents::increment(ProfileEvents::CreatedReadBufferAIO); return std::make_unique(filename_, buffer_size_, flags_, existing_memory_); #else throw Exception("AIO is not implemented yet on MacOS X", ErrorCodes::NOT_IMPLEMENTED); diff --git a/dbms/src/IO/createWriteBufferFromFileBase.cpp b/dbms/src/IO/createWriteBufferFromFileBase.cpp index 96bf3e65558..0e741eb3e5d 100644 --- a/dbms/src/IO/createWriteBufferFromFileBase.cpp +++ b/dbms/src/IO/createWriteBufferFromFileBase.cpp @@ -19,13 +19,6 @@ #endif #include - -namespace ProfileEvents -{ -extern const Event CreatedWriteBufferOrdinary; -extern const Event CreatedWriteBufferAIO; -} // namespace ProfileEvents - namespace DB { namespace ErrorCodes @@ -45,13 +38,11 @@ WriteBufferFromFileBase * createWriteBufferFromFileBase( { if ((aio_threshold == 0) || (estimated_size < aio_threshold)) { - ProfileEvents::increment(ProfileEvents::CreatedWriteBufferOrdinary); return new WriteBufferFromFile(filename_, buffer_size_, flags_, mode, existing_memory_, alignment); } else { #if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(_MSC_VER) - ProfileEvents::increment(ProfileEvents::CreatedWriteBufferAIO); return new WriteBufferAIO(filename_, buffer_size_, flags_, mode, existing_memory_); #else throw Exception("AIO is not implemented yet on MacOS X", ErrorCodes::NOT_IMPLEMENTED); diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 6e067b88d81..6cb947a1bfa 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -37,19 +38,6 @@ #include #include - -namespace ProfileEvents -{ -extern const Event ExternalAggregationWritePart; -extern const Event ExternalAggregationCompressedBytes; -extern const Event ExternalAggregationUncompressedBytes; -} // namespace ProfileEvents - -namespace CurrentMetrics -{ -extern const Metric QueryThread; -} - namespace DB { namespace ErrorCodes @@ -61,6 +49,11 @@ extern const int CANNOT_MERGE_DIFFERENT_AGGREGATED_DATA_VARIANTS; extern const int LOGICAL_ERROR; } // namespace ErrorCodes +namespace FailPoints +{ +extern const char random_aggregate_create_state_failpoint[]; +extern const char random_aggregate_merge_failpoint[]; +} // namespace FailPoints AggregatedDataVariants::~AggregatedDataVariants() { @@ -330,6 +323,7 @@ void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const * In order that then everything is properly destroyed, we "roll back" some of the created states. * The code is not very convenient. */ + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_aggregate_create_state_failpoint); aggregate_functions[j]->create(aggregate_data + offsets_of_aggregate_states[j]); } catch (...) 
@@ -658,7 +652,6 @@ void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants, co NativeBlockOutputStream block_out(compressed_buf, ClickHouseRevision::get(), getHeader(false)); LOG_FMT_DEBUG(log, "Writing part of aggregation data into temporary file {}.", path); - ProfileEvents::increment(ProfileEvents::ExternalAggregationWritePart); /// Flush only two-level data and possibly overflow data. @@ -694,9 +687,6 @@ void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants, co temporary_files.sum_size_compressed += compressed_bytes; } - ProfileEvents::increment(ProfileEvents::ExternalAggregationCompressedBytes, compressed_bytes); - ProfileEvents::increment(ProfileEvents::ExternalAggregationUncompressedBytes, uncompressed_bytes); - LOG_FMT_TRACE( log, "Written part in {:.3f} sec., {} rows, " @@ -1016,7 +1006,7 @@ Block Aggregator::prepareBlockAndFill( aggregate_columns[i] = header.getByName(aggregate_column_name).type->createColumn(); /// The ColumnAggregateFunction column captures the shared ownership of the arena with the aggregate function states. - ColumnAggregateFunction & column_aggregate_func = assert_cast(*aggregate_columns[i]); + auto & column_aggregate_func = assert_cast(*aggregate_columns[i]); for (auto & pool : data_variants.aggregates_pools) column_aggregate_func.addArena(pool); @@ -1502,7 +1492,7 @@ class MergingAndConvertingBlockInputStream : public IProfilingBlockInputStream Block getHeader() const override { return aggregator.getHeader(final); } - ~MergingAndConvertingBlockInputStream() + ~MergingAndConvertingBlockInputStream() override { LOG_FMT_TRACE(&Poco::Logger::get(__PRETTY_FUNCTION__), "Waiting for threads to finish"); @@ -1521,6 +1511,8 @@ class MergingAndConvertingBlockInputStream : public IProfilingBlockInputStream if (current_bucket_num >= NUM_BUCKETS) return {}; + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_aggregate_merge_failpoint); + AggregatedDataVariantsPtr & first = data[0]; if (current_bucket_num == -1) @@ -1636,8 +1628,6 @@ class MergingAndConvertingBlockInputStream : public IProfilingBlockInputStream void thread(Int32 bucket_num) { - CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread}; - try { /// TODO: add no_more_keys support maybe diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index a0adef5b50d..44699a324f4 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -77,10 +78,10 @@ namespace ProfileEvents extern const Event ContextLock; } +#include + namespace CurrentMetrics { -extern const Metric ContextLockWait; -extern const Metric MemoryTrackingForMerges; extern const Metric GlobalStorageRunMode; } // namespace CurrentMetrics @@ -308,8 +309,6 @@ Context::~Context() std::unique_lock Context::getLock() const { - ProfileEvents::increment(ProfileEvents::ContextLock); - CurrentMetrics::Increment increment{CurrentMetrics::ContextLockWait}; return std::unique_lock(shared->mutex); } @@ -1879,6 +1878,30 @@ SharedQueriesPtr Context::getSharedQueries() return shared->shared_queries; } +size_t Context::getMaxStreams() const +{ + size_t max_streams = settings.max_threads; + bool is_cop_request = false; + if (dag_context != nullptr) + { + if (dag_context->isTest()) + max_streams = dag_context->initialize_concurrency; + else if (!dag_context->isBatchCop() && !dag_context->isMPPTask()) + { + is_cop_request = true; + max_streams = 1; + } + } + if (max_streams > 
1) + max_streams *= settings.max_streams_to_max_threads_ratio; + if (max_streams == 0) + max_streams = 1; + if (unlikely(max_streams != 1 && is_cop_request)) + /// for cop request, the max_streams should be 1 + throw Exception("Cop request only support running with max_streams = 1"); + return max_streams; +} + SessionCleaner::~SessionCleaner() { try diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index 5d5c39263c6..b6e759e364b 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -459,6 +459,8 @@ class Context void reloadDeltaTreeConfig(const Poco::Util::AbstractConfiguration & config); + size_t getMaxStreams() const; + private: /** Check if the current client has access to the specified database. * If access is denied, throw an exception. diff --git a/dbms/src/Interpreters/ExpressionActions.cpp b/dbms/src/Interpreters/ExpressionActions.cpp index 8e75a64427c..0ab8519e4d0 100644 --- a/dbms/src/Interpreters/ExpressionActions.cpp +++ b/dbms/src/Interpreters/ExpressionActions.cpp @@ -27,12 +27,6 @@ #include #include - -namespace ProfileEvents -{ -extern const Event FunctionExecute; -} - namespace DB { namespace ErrorCodes @@ -339,7 +333,6 @@ void ExpressionAction::execute(Block & block) const size_t num_columns_without_result = block.columns(); block.insert({nullptr, result_type, result_name}); - ProfileEvents::increment(ProfileEvents::FunctionExecute); function->execute(block, arguments, num_columns_without_result); break; diff --git a/dbms/src/Interpreters/InterpreterInsertQuery.cpp b/dbms/src/Interpreters/InterpreterInsertQuery.cpp index aa64cf8ca94..782a254925a 100644 --- a/dbms/src/Interpreters/InterpreterInsertQuery.cpp +++ b/dbms/src/Interpreters/InterpreterInsertQuery.cpp @@ -30,12 +30,6 @@ #include #include - -namespace ProfileEvents -{ -extern const Event InsertQuery; -} - namespace DB { namespace ErrorCodes @@ -54,7 +48,6 @@ InterpreterInsertQuery::InterpreterInsertQuery( , context(context_) , allow_materialized(allow_materialized_) { - ProfileEvents::increment(ProfileEvents::InsertQuery); } @@ -62,7 +55,7 @@ StoragePtr InterpreterInsertQuery::getTable(const ASTInsertQuery & query) { if (query.table_function) { - auto table_function = typeid_cast(query.table_function.get()); + const auto * table_function = typeid_cast(query.table_function.get()); const auto & factory = TableFunctionFactory::instance(); return factory.get(table_function->name, context)->execute(query.table_function, context); } @@ -71,7 +64,7 @@ StoragePtr InterpreterInsertQuery::getTable(const ASTInsertQuery & query) return context.getTable(query.database, query.table); } -Block InterpreterInsertQuery::getSampleBlock(const ASTInsertQuery & query, const StoragePtr & table) +Block InterpreterInsertQuery::getSampleBlock(const ASTInsertQuery & query, const StoragePtr & table) // NOLINT { Block table_sample_non_materialized; if (query.is_import) diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index 51b55f65bd4..3514f915626 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
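For readers skimming the hunk above: the new `Context::getMaxStreams()` encodes a small policy. Here it is restated as a hedged, self-contained sketch (parameter names are mine; the real method reads `settings` and `dag_context` directly):

```cpp
#include <cstddef>
#include <stdexcept>

// Hedged restatement of Context::getMaxStreams(): cop requests are single-streamed,
// tests pin the concurrency, and everything else scales max_threads by the configured ratio.
std::size_t computeMaxStreams(
    std::size_t max_threads,
    double max_streams_to_max_threads_ratio,
    bool has_dag_context,
    bool is_test,
    std::size_t initialize_concurrency,
    bool is_batch_cop_or_mpp)
{
    std::size_t max_streams = max_threads;
    bool is_cop_request = false;
    if (has_dag_context)
    {
        if (is_test)
            max_streams = initialize_concurrency;
        else if (!is_batch_cop_or_mpp)
        {
            is_cop_request = true;
            max_streams = 1; // a plain cop request must run with a single stream
        }
    }
    if (max_streams > 1)
        max_streams = static_cast<std::size_t>(max_streams * max_streams_to_max_threads_ratio);
    if (max_streams == 0)
        max_streams = 1;
    if (max_streams != 1 && is_cop_request)
        throw std::runtime_error("Cop request only support running with max_streams = 1");
    return max_streams;
}
```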
#include +#include #include #include @@ -93,6 +94,12 @@ extern const int SCHEMA_VERSION_ERROR; extern const int UNKNOWN_EXCEPTION; } // namespace ErrorCodes + +namespace FailPoints +{ +extern const char pause_query_init[]; +} // namespace FailPoints + InterpreterSelectQuery::InterpreterSelectQuery( const ASTPtr & query_ptr_, const Context & context_, @@ -131,7 +138,14 @@ InterpreterSelectQuery::~InterpreterSelectQuery() = default; void InterpreterSelectQuery::init(const Names & required_result_column_names) { - ProfileEvents::increment(ProfileEvents::SelectQuery); + /// The failpoint pause_query_init should be used together with the failpoint unblock_query_init_after_write, + /// so that the select query is blocked before its init stage until the write action has finished. + /// In tests, we need to enable unblock_query_init_after_write in the test code, + /// and enable pause_query_init before each write statement takes effect. + /// When the write action finishes, pause_query_init is disabled automatically, + /// and then the select query can continue. + /// See multi_alter_with_write.test for an example. + FAIL_POINT_PAUSE(FailPoints::pause_query_init); if (!context.hasQueryContext()) context.setQueryContext(context); @@ -498,13 +512,13 @@ void InterpreterSelectQuery::executeImpl(Pipeline & pipeline, const BlockInputSt { const auto & join = static_cast(*query.join()->table_join); if (join.kind == ASTTableJoin::Kind::Full || join.kind == ASTTableJoin::Kind::Right) - pipeline.stream_with_non_joined_data = expressions.before_join->createStreamWithNonJoinedDataIfFullOrRightJoin( + pipeline.streams_with_non_joined_data.push_back(expressions.before_join->createStreamWithNonJoinedDataIfFullOrRightJoin( pipeline.firstStream()->getHeader(), 0, 1, - settings.max_block_size); + settings.max_block_size)); - for (auto & stream : pipeline.streams) /// Applies to all sources except stream_with_non_joined_data. + for (auto & stream : pipeline.streams) /// Applies to all sources except streams_with_non_joined_data. stream = std::make_shared(stream, expressions.before_join, /*req_id=*/""); } @@ -589,7 +603,7 @@ void InterpreterSelectQuery::executeImpl(Pipeline & pipeline, const BlockInputSt if (need_second_distinct_pass || query.limit_length || query.limit_by_expression_list - || pipeline.stream_with_non_joined_data) + || !pipeline.streams_with_non_joined_data.empty()) { need_merge_streams = true; } @@ -973,11 +987,11 @@ void InterpreterSelectQuery::executeAggregation(Pipeline & pipeline, const Expre Aggregator::Params params(header, keys, aggregates, overflow_row, settings.max_rows_to_group_by, settings.group_by_overflow_mode, allow_to_use_two_level_group_by ? settings.group_by_two_level_threshold : SettingUInt64(0), allow_to_use_two_level_group_by ? 
settings.group_by_two_level_threshold_bytes : SettingUInt64(0), settings.max_bytes_before_external_group_by, settings.empty_result_for_aggregation_by_empty_set, context.getTemporaryPath()); /// If there are several sources, then we perform parallel aggregation - if (pipeline.streams.size() > 1) + if (pipeline.streams.size() > 1 || pipeline.streams_with_non_joined_data.size() > 1) { - pipeline.firstStream() = std::make_shared( + auto stream = std::make_shared( pipeline.streams, - pipeline.stream_with_non_joined_data, + pipeline.streams_with_non_joined_data, params, file_provider, final, @@ -987,19 +1001,21 @@ void InterpreterSelectQuery::executeAggregation(Pipeline & pipeline, const Expre : static_cast(settings.max_threads), /*req_id=*/""); - pipeline.stream_with_non_joined_data = nullptr; pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); } else { BlockInputStreams inputs; if (!pipeline.streams.empty()) inputs.push_back(pipeline.firstStream()); - else - pipeline.streams.resize(1); - if (pipeline.stream_with_non_joined_data) - inputs.push_back(pipeline.stream_with_non_joined_data); + if (!pipeline.streams_with_non_joined_data.empty()) + inputs.push_back(pipeline.streams_with_non_joined_data.at(0)); + + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); pipeline.firstStream() = std::make_shared( std::make_shared(inputs, /*req_id=*/""), @@ -1007,8 +1023,6 @@ void InterpreterSelectQuery::executeAggregation(Pipeline & pipeline, const Expre file_provider, final, /*req_id=*/""); - - pipeline.stream_with_non_joined_data = nullptr; } } @@ -1230,21 +1244,33 @@ void InterpreterSelectQuery::executeDistinct(Pipeline & pipeline, bool before_or void InterpreterSelectQuery::executeUnion(Pipeline & pipeline) { - /// If there are still several streams, then we combine them into one - if (pipeline.hasMoreThanOneStream()) + switch (pipeline.streams.size() + pipeline.streams_with_non_joined_data.size()) + { + case 0: + break; + case 1: + { + if (pipeline.streams.size() == 1) + break; + // streams_with_non_joined_data's size is 1. + pipeline.streams.push_back(pipeline.streams_with_non_joined_data.at(0)); + pipeline.streams_with_non_joined_data.clear(); + break; + } + default: { - pipeline.firstStream() = std::make_shared>( + BlockInputStreamPtr stream = std::make_shared>( pipeline.streams, - pipeline.stream_with_non_joined_data, + pipeline.streams_with_non_joined_data, max_streams, /*req_id=*/""); - pipeline.stream_with_non_joined_data = nullptr; + ; + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); + break; } - else if (pipeline.stream_with_non_joined_data) - { - pipeline.streams.push_back(pipeline.stream_with_non_joined_data); - pipeline.stream_with_non_joined_data = nullptr; } } diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.h b/dbms/src/Interpreters/InterpreterSelectQuery.h index 474ace7ee84..d1bcec2a3dd 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.h +++ b/dbms/src/Interpreters/InterpreterSelectQuery.h @@ -95,7 +95,7 @@ class InterpreterSelectQuery : public IInterpreter * It has a special meaning, since reading from it should be done after reading from the main streams. * It is appended to the main streams in UnionBlockInputStream or ParallelAggregatingBlockInputStream. 
*/ - BlockInputStreamPtr stream_with_non_joined_data; + BlockInputStreams streams_with_non_joined_data; BlockInputStreamPtr & firstStream() { return streams.at(0); } @@ -105,13 +105,13 @@ class InterpreterSelectQuery : public IInterpreter for (auto & stream : streams) transform(stream); - if (stream_with_non_joined_data) - transform(stream_with_non_joined_data); + for (auto & stream : streams_with_non_joined_data) + transform(stream); } bool hasMoreThanOneStream() const { - return streams.size() + (stream_with_non_joined_data ? 1 : 0) > 1; + return streams.size() + streams_with_non_joined_data.size() > 1; } }; diff --git a/dbms/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/dbms/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index 5e73b1e5f3e..076c290cc9d 100644 --- a/dbms/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -224,7 +224,7 @@ BlockIO InterpreterSelectWithUnionQuery::execute() } else { - result_stream = std::make_shared>(nested_streams, nullptr, settings.max_threads, /*req_id=*/""); + result_stream = std::make_shared>(nested_streams, BlockInputStreams{}, settings.max_threads, /*req_id=*/""); nested_streams.clear(); } diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index 820618a6e8b..181ebcaaa64 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,17 @@ #include #include #include +#include + namespace DB { +namespace FailPoints +{ +extern const char random_join_build_failpoint[]; +extern const char random_join_prob_failpoint[]; +} // namespace FailPoints + namespace ErrorCodes { extern const int UNKNOWN_SET_DATA_VARIANT; @@ -621,6 +630,7 @@ void NO_INLINE insertFromBlockImplTypeCaseWithLock( } for (size_t insert_index = 0; insert_index < segment_index_info.size(); insert_index++) { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_join_build_failpoint); size_t segment_index = (insert_index + stream_index) % segment_index_info.size(); if (segment_index == segment_size) { @@ -1513,7 +1523,7 @@ void Join::joinBlockImpl(Block & block, const Maps & maps) const default: throw Exception("Unknown JOIN keys variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT); } - + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_join_prob_failpoint); for (size_t i = 0; i < num_columns_to_add; ++i) { const ColumnWithTypeAndName & sample_col = sample_block_with_columns_to_add.getByPosition(i); diff --git a/dbms/src/Interpreters/ProcessList.h b/dbms/src/Interpreters/ProcessList.h index fdc009237aa..5ed586c263d 100644 --- a/dbms/src/Interpreters/ProcessList.h +++ b/dbms/src/Interpreters/ProcessList.h @@ -31,12 +31,6 @@ #include #include - -namespace CurrentMetrics -{ -extern const Metric Query; -} - namespace DB { class IStorage; @@ -90,8 +84,6 @@ class ProcessListElement QueryPriorities::Handle priority_handle; - CurrentMetrics::Increment num_queries{CurrentMetrics::Query}; - std::atomic is_killed{false}; /// Be careful using it. For example, queries field could be modified concurrently. 
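To summarize the pipeline change above: the single `stream_with_non_joined_data` becomes a vector, since a full/right join may now produce several non-joined sources, and `executeUnion` merges by total count: zero sources do nothing, a single source is promoted into `streams`, and multiple sources are combined through a union stream. A toy restatement of that rule (types here are placeholders for `BlockInputStreamPtr`):

```cpp
#include <memory>
#include <vector>

struct Stream // stand-in for the pointee of BlockInputStreamPtr
{
};
using StreamPtr = std::shared_ptr<Stream>;
using Streams = std::vector<StreamPtr>;

// Toy restatement of the new executeUnion() rule over the two stream lists.
void unionStreams(Streams & streams, Streams & streams_with_non_joined_data)
{
    switch (streams.size() + streams_with_non_joined_data.size())
    {
    case 0:
        break; // nothing to combine
    case 1:
    {
        if (streams.empty())
        {
            // The only source is a non-joined one: promote it into the main list.
            streams.push_back(streams_with_non_joined_data.front());
            streams_with_non_joined_data.clear();
        }
        break;
    }
    default:
    {
        // Several sources: combine them into one (a UnionBlockInputStream in the real code).
        StreamPtr combined = std::make_shared<Stream>(); // placeholder for the union stream
        streams.assign(1, combined);
        streams_with_non_joined_data.clear();
        break;
    }
    }
}
```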
diff --git a/dbms/src/Interpreters/QueryPriorities.h b/dbms/src/Interpreters/QueryPriorities.h index 5f34ae616c7..ca01e4f0a6c 100644 --- a/dbms/src/Interpreters/QueryPriorities.h +++ b/dbms/src/Interpreters/QueryPriorities.h @@ -23,13 +23,6 @@ #include #include - -namespace CurrentMetrics -{ -extern const Metric QueryPreempted; -} - - namespace DB { /** Implements query priorities in very primitive way. @@ -95,7 +88,6 @@ class QueryPriorities if (!found) return true; - CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryPreempted}; if (std::cv_status::timeout == condvar.wait_for(lock, cur_timeout)) return false; else diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index b827cb8b6a0..21366c336a8 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -272,7 +272,7 @@ struct Settings M(SettingUInt64, dt_segment_delta_small_column_file_size, 8388608, "Determine whether a column file in delta is small or not. 8MB by default.") \ M(SettingUInt64, dt_segment_stable_pack_rows, DEFAULT_MERGE_BLOCK_SIZE, "Expected stable pack rows in DeltaTree Engine.") \ M(SettingFloat, dt_segment_wait_duration_factor, 1, "The factor of wait duration in a write stall.") \ - M(SettingUInt64, dt_bg_gc_check_interval, 60, "Background gc thread check interval, the unit is second.") \ + M(SettingUInt64, dt_bg_gc_check_interval, 60, "Background gc thread check interval, the unit is second.") \ M(SettingInt64, dt_bg_gc_max_segments_to_check_every_round, 100, "Max segments to check in every gc round, value less than or equal to 0 means gc no segments.") \ M(SettingFloat, dt_bg_gc_ratio_threhold_to_trigger_gc, 1.2, "Trigger segment's gc when the ratio of invalid version exceed this threhold. Values smaller than or equal to 1.0 means gc all " \ "segments") \ @@ -361,10 +361,12 @@ struct Settings M(SettingUInt64, async_pollers_per_cq, 200, "grpc async pollers per cqs") \ M(SettingUInt64, async_cqs, 1, "grpc async cqs") \ M(SettingUInt64, preallocated_request_count_per_poller, 20, "grpc preallocated_request_count_per_poller") \ + \ M(SettingUInt64, manual_compact_pool_size, 1, "The number of worker threads to handle manual compact requests.") \ M(SettingUInt64, manual_compact_max_concurrency, 10, "Max concurrent tasks. It should be larger than pool size.") \ M(SettingUInt64, manual_compact_more_until_ms, 60000, "Continuously compact more segments until reaching specified elapsed time. 
If 0 is specified, only one segment will be compacted each round.") \ M(SettingBool, enable_planner, true, "Enable planner") + // clang-format on #define DECLARE(TYPE, NAME, DEFAULT, DESCRIPTION) TYPE NAME{DEFAULT}; diff --git a/dbms/src/Interpreters/WindowDescription.cpp b/dbms/src/Interpreters/WindowDescription.cpp index 2ab407bb18e..09d96411673 100644 --- a/dbms/src/Interpreters/WindowDescription.cpp +++ b/dbms/src/Interpreters/WindowDescription.cpp @@ -44,7 +44,7 @@ WindowFrame::FrameType getFrameTypeFromTipb(const tipb::WindowFrameType & type) return WindowFrame::FrameType::Groups; default: throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Unknowed frame type {}", + "Unknown frame type {}", type); } } @@ -60,4 +60,38 @@ void WindowDescription::setWindowFrame(const tipb::WindowFrame & frame_) frame.end_preceding = (frame_.end().type() == tipb::WindowBoundType::Preceding); frame.is_default = false; } + +String frameTypeToString(const WindowFrame::FrameType & type) +{ + switch (type) + { + case WindowFrame::FrameType::Rows: + return "Rows"; + case WindowFrame::FrameType::Groups: + return "Groups"; + case WindowFrame::FrameType::Ranges: + return "Ranges"; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unknown frame type {}", + type); + } +} + +String boundaryTypeToString(const WindowFrame::BoundaryType & type) +{ + switch (type) + { + case WindowFrame::BoundaryType::Unbounded: + return "Unbounded"; + case WindowFrame::BoundaryType::Current: + return "Current"; + case WindowFrame::BoundaryType::Offset: + return "Offset"; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unknown boundary type {}", + type); + } +} } // namespace DB diff --git a/dbms/src/Interpreters/WindowDescription.h b/dbms/src/Interpreters/WindowDescription.h index cdcade1b750..a3c2bac5747 100644 --- a/dbms/src/Interpreters/WindowDescription.h +++ b/dbms/src/Interpreters/WindowDescription.h @@ -87,6 +87,10 @@ struct WindowFrame && other.end_preceding == end_preceding; } }; + +String frameTypeToString(const WindowFrame::FrameType & type); +String boundaryTypeToString(const WindowFrame::BoundaryType & type); + class ExpressionActions; using ExpressionActionsPtr = std::shared_ptr; struct WindowDescription diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp index 96cfc0a58ae..78ad4b41ce6 100644 --- a/dbms/src/Interpreters/executeQuery.cpp +++ b/dbms/src/Interpreters/executeQuery.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include #include @@ -53,7 +54,10 @@ extern const int LOGICAL_ERROR; extern const int QUERY_IS_TOO_LARGE; extern const int INTO_OUTFILE_NOT_ALLOWED; } // namespace ErrorCodes - +namespace FailPoints +{ +extern const char random_interpreter_failpoint[]; +} // namespace FailPoints namespace { void checkASTSizeLimits(const IAST & ast, const Settings & settings) @@ -226,6 +230,7 @@ std::tuple executeQueryImpl( context.setProcessListElement(&process_list_entry->get()); } + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_interpreter_failpoint); auto interpreter = query_src.interpreter(context, stage); res = interpreter->execute(); diff --git a/dbms/src/Server/CLIService.h b/dbms/src/Server/CLIService.h index 9078fa991f3..0acffebb577 100644 --- a/dbms/src/Server/CLIService.h +++ b/dbms/src/Server/CLIService.h @@ -126,6 +126,8 @@ CLIService::TiFlashProxyConfig::TiFlashProxyConfig(Poco::Util::Layer args.push_back(v.first.data()); args.push_back(v.second.data()); } + // Start the decryption service without starting the raftstore service + args.push_back("--only-decryption"); is_proxy_runnable = true; } template diff --git a/dbms/src/Server/CMakeLists.txt b/dbms/src/Server/CMakeLists.txt index 63cf6d0e1f9..2948bb076db 100644 --- a/dbms/src/Server/CMakeLists.txt +++ b/dbms/src/Server/CMakeLists.txt @@ -22,6 +22,7 @@ option(ENABLE_CLICKHOUSE_SERVER "Enable server" ${ENABLE_CLICKHOUSE_ALL}) option(ENABLE_CLICKHOUSE_CLIENT "Enable client" ${ENABLE_CLICKHOUSE_ALL}) option(ENABLE_TIFLASH_DTTOOL "Enable dttool: tools to manage dmfile" ${ENABLE_CLICKHOUSE_ALL}) option(ENABLE_TIFLASH_DTWORKLOAD "Enable dtworkload: tools to test and stress DeltaTree" ${ENABLE_CLICKHOUSE_ALL}) +option(ENABLE_TIFLASH_PAGEWORKLOAD "Enable pageworkload: tools to test and stress PageStorage" ${ENABLE_CLICKHOUSE_ALL}) option(ENABLE_TIFLASH_PAGECTL "Enable pagectl: tools to debug page storage" ${ENABLE_CLICKHOUSE_ALL}) configure_file (config_tools.h.in ${CMAKE_CURRENT_BINARY_DIR}/config_tools.h) @@ -33,6 +34,7 @@ add_library (clickhouse-server-lib NotFoundHandler.cpp PingRequestHandler.cpp RootRequestHandler.cpp + ServerInfo.cpp Server.cpp StatusFile.cpp TCPHandler.cpp @@ -136,6 +138,9 @@ endif () if (ENABLE_TIFLASH_DTWORKLOAD) target_link_libraries(tiflash dt-workload-lib) endif () +if (ENABLE_TIFLASH_PAGEWORKLOAD) + target_link_libraries(tiflash page-workload-lib) +endif() if (ENABLE_TIFLASH_PAGECTL) target_link_libraries(tiflash page-ctl-lib) endif () diff --git a/dbms/src/Server/HTTPHandler.h b/dbms/src/Server/HTTPHandler.h index bd06d56bd4e..74b5cc4b4c7 100644 --- a/dbms/src/Server/HTTPHandler.h +++ b/dbms/src/Server/HTTPHandler.h @@ -14,24 +14,20 @@ #pragma once -#include "IServer.h" - -#include - #include #include +#include +#include "IServer.h" -namespace CurrentMetrics + +namespace Poco { - extern const Metric HTTPConnection; +class Logger; } -namespace Poco { class Logger; } - namespace DB { - class WriteBufferFromHTTPServerResponse; @@ -69,11 +65,9 @@ class HTTPHandler : public Poco::Net::HTTPRequestHandler IServer & server; Poco::Logger * log; - /// It is the name of the server that will be sent in an http-header X-ClickHouse-Server-Display-Name. + /// It is the name of the server that will be sent in an http-header X-ClickHouse-Server-Display-Name. String server_display_name; - CurrentMetrics::Increment metric_increment{CurrentMetrics::HTTPConnection}; - /// Also initializes 'used_output'. 
void processQuery( Poco::Net::HTTPServerRequest & request, @@ -91,4 +85,4 @@ class HTTPHandler : public Poco::Net::HTTPRequestHandler void pushDelayedResults(Output & used_output); }; -} +} // namespace DB diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 04676ef969d..a398aa9c74d 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "Server.h" - #include #include #include #include #include #include +#include #include #include #include @@ -56,6 +55,8 @@ #include #include #include +#include +#include #include #include #include @@ -72,7 +73,6 @@ #include #include #include -#include #include #include @@ -152,6 +152,7 @@ void loadMiConfig(Logger * log) } #undef TRY_LOAD_CONF #endif + namespace { [[maybe_unused]] void tryLoadBoolConfigFromEnv(Poco::Logger * log, bool & target, const char * name) @@ -179,8 +180,9 @@ namespace namespace CurrentMetrics { -extern const Metric Revision; -} +extern const Metric LogicalCPUCores; +extern const Metric MemoryCapacity; +} // namespace CurrentMetrics namespace DB { @@ -190,6 +192,7 @@ extern const int NO_ELEMENTS_IN_CONFIG; extern const int SUPPORT_IS_DISABLED; extern const int ARGUMENT_OUT_OF_BOUND; extern const int INVALID_CONFIG_PARAMETER; +extern const int IP_ADDRESS_NOT_ALLOWED; } // namespace ErrorCodes namespace Debug @@ -627,6 +630,10 @@ class Server::FlashGrpcServerHolder } } flash_grpc_server = builder.BuildAndStart(); + if (!flash_grpc_server) + { + throw Exception("Exception happened when starting the grpc server, flash.service_addr may be invalid, flash.service_addr is " + raft_config.flash_server_addr, ErrorCodes::IP_ADDRESS_NOT_ALLOWED); + } LOG_FMT_INFO(log, "Flash grpc server listening on [{}]", raft_config.flash_server_addr); Debug::setServiceAddr(raft_config.flash_server_addr); if (enable_async_server) @@ -967,7 +974,10 @@ class Server::TcpHttpServersHolder LOG_DEBUG(log, debug_msg); } - const std::vector> & getServers() const { return servers; } + const std::vector> & getServers() const + { + return servers; + } private: Server & server; @@ -983,6 +993,7 @@ int Server::main(const std::vector & /*args*/) Poco::Logger * log = &logger(); #ifdef FIU_ENABLE fiu_init(0); // init failpoint + FailPointHelper::initRandomFailPoints(config(), log); #endif UpdateMallocConfig(log); @@ -1002,7 +1013,6 @@ int Server::main(const std::vector & /*args*/) #ifdef TIFLASH_ENABLE_SVE_SUPPORT tryLoadBoolConfigFromEnv(log, simd_option::ENABLE_SVE, "TIFLASH_ENABLE_SVE"); #endif - registerFunctions(); registerAggregateFunctions(); registerWindowFunctions(); @@ -1049,7 +1059,22 @@ int Server::main(const std::vector & /*args*/) LOG_FMT_INFO(log, "tiflash proxy thread is joined"); }); - CurrentMetrics::set(CurrentMetrics::Revision, ClickHouseRevision::get()); + /// get CPU/memory/disk info of this server + if (tiflash_instance_wrap.proxy_helper) + { + diagnosticspb::ServerInfoRequest request; + request.set_tp(static_cast(1)); + diagnosticspb::ServerInfoResponse response; + std::string req = request.SerializeAsString(); + auto * helper = tiflash_instance_wrap.proxy_helper; + helper->fn_server_info(helper->proxy_ptr, strIntoView(&req), &response); + server_info.parseSysInfo(response); + LOG_FMT_INFO(log, "ServerInfo: {}", server_info.debugString()); + } + else + { + LOG_FMT_INFO(log, "TiFlashRaftProxyHelper is null, failed to get server info"); + } // print necessary grpc log.
grpc_log = &Poco::Logger::get("grpc"); @@ -1408,12 +1433,14 @@ int Server::main(const std::vector & /*args*/) { // on ARM processors it can show only enabled at current moment cores + CurrentMetrics::set(CurrentMetrics::LogicalCPUCores, server_info.cpu_info.logical_cores); + CurrentMetrics::set(CurrentMetrics::MemoryCapacity, server_info.memory_info.capacity); LOG_FMT_INFO( log, - "Available RAM = {}; physical cores = {}; threads = {}.", - formatReadableSizeWithBinarySuffix(getMemoryAmount()), - getNumberOfPhysicalCPUCores(), - std::thread::hardware_concurrency()); + "Available RAM = {}; physical cores = {}; logical cores = {}.", + server_info.memory_info.capacity, + server_info.cpu_info.physical_cores, + server_info.cpu_info.logical_cores); } LOG_FMT_INFO(log, "Ready for connections."); diff --git a/dbms/src/Server/Server.h b/dbms/src/Server/Server.h index 278349f2aa4..07c5b955a92 100644 --- a/dbms/src/Server/Server.h +++ b/dbms/src/Server/Server.h @@ -14,10 +14,10 @@ #pragma once +#include +#include #include -#include "IServer.h" - /** Server provides three interfaces: * 1. HTTP - simple interface for any applications. * 2. TCP - interface for native clickhouse-client and for server to server internal communications. @@ -39,7 +39,7 @@ class Server : public BaseDaemon return BaseDaemon::config(); } - virtual const TiFlashSecurityConfig & securityConfig() const override { return security_config; }; + const TiFlashSecurityConfig & securityConfig() const override { return security_config; }; Poco::Logger & logger() const override { @@ -70,6 +70,8 @@ class Server : public BaseDaemon TiFlashSecurityConfig security_config; + ServerInfo server_info; + class FlashGrpcServerHolder; class TcpHttpServersHolder; }; diff --git a/dbms/src/Server/ServerInfo.cpp b/dbms/src/Server/ServerInfo.cpp new file mode 100644 index 00000000000..9cba40c4775 --- /dev/null +++ b/dbms/src/Server/ServerInfo.cpp @@ -0,0 +1,199 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include + +namespace DB +{ +using diagnosticspb::ServerInfoItem; +using diagnosticspb::ServerInfoResponse; + +void ServerInfo::parseCPUInfo(const diagnosticspb::ServerInfoItem & cpu_info_item) +{ + for (const auto & pair : cpu_info_item.pairs()) + { + const auto & key = pair.key(); + if (key == "cpu-logical-cores") + { + cpu_info.logical_cores = static_cast(std::stoi(pair.value())); + } + else if (key == "cpu-physical-cores") + { + cpu_info.physical_cores = static_cast(std::stoi(pair.value())); + } + else if (key == "cpu-frequency") + { + cpu_info.frequency = pair.value(); + } + else if (key == "l1-cache-size") + { + cpu_info.l1_cache_size = static_cast(std::stoull(pair.value())); + } + else if (key == "l1-cache-line-size") + { + cpu_info.l1_cache_line_size = static_cast(std::stoi(pair.value())); + } + else if (key == "l2-cache-size") + { + cpu_info.l2_cache_size = static_cast(std::stoull(pair.value())); + } + else if (key == "l2-cache-line-size") + { + cpu_info.l2_cache_line_size = static_cast(std::stoi(pair.value())); + } + else if (key == "l3-cache-size") + { + cpu_info.l3_cache_size = static_cast(std::stoull(pair.value())); + } + else if (key == "l3-cache-line-size") + { + cpu_info.l3_cache_line_size = static_cast(std::stoi(pair.value())); + } + else if (key == "cpu-arch") + { + cpu_info.arch = pair.value(); + } + } +} + +void ServerInfo::parseDiskInfo(const diagnosticspb::ServerInfoItem & disk_info_item) +{ + Disk disk; + disk.name = disk_info_item.name(); + for (const auto & pair : disk_info_item.pairs()) + { + const auto & key = pair.key(); + if (key == "type") + { + if (pair.value() == "HDD") + { + disk.disk_type = Disk::DiskType::HDD; + } + else if (pair.value() == "SSD") + { + disk.disk_type = Disk::DiskType::SSD; + } + else + { + disk.disk_type = Disk::DiskType::UNKNOWN; + } + } + else if (key == "total") + { + disk.total_space = static_cast(std::stoull(pair.value())); + } + else if (key == "free") + { + disk.free_space = static_cast(std::stoull(pair.value())); + } + else if (key == "path") + { + disk.mount_point = pair.value(); + } + else if (key == "fstype") + { + disk.fs_type = pair.value(); + } + } + disk_infos.push_back(disk); +} + +void ServerInfo::parseMemoryInfo(const diagnosticspb::ServerInfoItem & memory_info_item) +{ + for (const auto & pair : memory_info_item.pairs()) + { + if (pair.key() == "capacity") + { + memory_info.capacity = std::stoull(pair.value()); + } + } +} + +void ServerInfo::parseSysInfo(const diagnosticspb::ServerInfoResponse & sys_info_response) +{ + for (const auto & item : sys_info_response.items()) + { + const auto & tp = item.tp(); + if (tp == "cpu") + { + parseCPUInfo(item); + } + else if (tp == "disk") + { + parseDiskInfo(item); + } + else if (tp == "memory") + { + parseMemoryInfo(item); + } + } +} + +String ServerInfo::debugString() const +{ + FmtBuffer fmt_buf; + // append cpu info + fmt_buf.fmtAppend("CPU: \n" + " logical cores: {}\n" + " physical cores: {}\n" + " frequency: {}\n" + " l1 cache size: {}\n" + " l1 cache line size: {}\n" + " l2 cache size: {}\n" + " l2 cache line size: {}\n" + " l3 cache size: {}\n" + " l3 cache line size: {}\n" + " arch: {}\n", + cpu_info.logical_cores, + cpu_info.physical_cores, + cpu_info.frequency, + cpu_info.l1_cache_size, + cpu_info.l1_cache_line_size, + cpu_info.l2_cache_size, + cpu_info.l2_cache_line_size, + cpu_info.l3_cache_size, + cpu_info.l3_cache_line_size, + cpu_info.arch); + // append disk info + { + const static String disk_type_str[] = {"UNKNOWN", "HDD", "SSD"}; + for 
(const auto & disk_info : disk_infos) + { + fmt_buf.fmtAppend("Disk: \n" + " name: {}\n" + " type: {}\n" + " total space: {}\n" + " free space: {}\n" + " mount point: {}\n" + " fstype: {}\n", + disk_info.name, + disk_type_str[static_cast(disk_info.disk_type)], + disk_info.total_space, + disk_info.free_space, + disk_info.mount_point, + disk_info.fs_type); + } + } + // append memory info + fmt_buf.fmtAppend("Memory: \n" + " capacity: {}\n", + memory_info.capacity); + + return fmt_buf.toString(); +} + +} // namespace DB \ No newline at end of file diff --git a/dbms/src/Server/ServerInfo.h b/dbms/src/Server/ServerInfo.h new file mode 100644 index 00000000000..9663bd37568 --- /dev/null +++ b/dbms/src/Server/ServerInfo.h @@ -0,0 +1,99 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif +#include +#pragma GCC diagnostic pop + +namespace DB +{ +class ServerInfo +{ +public: + struct CPUInfo + { + /// number of logical CPU cores + UInt16 logical_cores = std::thread::hardware_concurrency(); + /// number of physical CPU cores + UInt16 physical_cores = getNumberOfPhysicalCPUCores(); + /// L1 cache size + /// units: Byte + UInt32 l1_cache_size = 16384; // 16KB (typical value) + /// L2 cache size + /// units: Byte + UInt32 l2_cache_size = 65536; // 64KB (typical value) + /// L3 cache size + /// units: Byte + UInt32 l3_cache_size = 2097152; // 2MB (typical value) + /// L1 cache line size + UInt8 l1_cache_line_size = 64; // 64B (typical value) + /// L2 cache line size + UInt8 l2_cache_line_size = 64; // 64B (typical value) + /// L3 cache line size + UInt8 l3_cache_line_size = 64; // 64B (typical value) + /// CPU architecture + String arch; + /// CPU frequency + String frequency; + }; + + struct Disk + { + String name; + enum DiskType + { + UNKNOWN = 0, + HDD = 1, + SSD = 2 + }; + DiskType disk_type; + UInt64 total_space = 0; + UInt64 free_space = 0; + String mount_point; + String fs_type; + }; + using DiskInfo = std::vector; + + struct MemoryInfo + { + /// total memory size + /// units: Byte + UInt64 capacity = getMemoryAmount(); + }; + + ServerInfo() = default; + ~ServerInfo() = default; + void parseCPUInfo(const diagnosticspb::ServerInfoItem & cpu_info_item); + void parseDiskInfo(const diagnosticspb::ServerInfoItem & disk_info_item); + void parseMemoryInfo(const diagnosticspb::ServerInfoItem & memory_info_item); + void parseSysInfo(const diagnosticspb::ServerInfoResponse & sys_info_response); + String debugString() const; + + CPUInfo cpu_info; + DiskInfo disk_infos; + MemoryInfo memory_info; +}; +} // namespace DB diff --git a/dbms/src/Server/TCPHandler.h b/dbms/src/Server/TCPHandler.h index ed0af52dc98..2fde0b11d9b 100644 --- a/dbms/src/Server/TCPHandler.h +++ 
b/dbms/src/Server/TCPHandler.h @@ -29,11 +29,6 @@ #include "IServer.h" -namespace CurrentMetrics -{ -extern const Metric TCPConnection; -} - namespace Poco { class Logger; @@ -131,8 +126,6 @@ class TCPHandler : public Poco::Net::TCPServerConnection /// At the moment, only one ongoing query in the connection is supported at a time. QueryState state; - CurrentMetrics::Increment metric_increment{CurrentMetrics::TCPConnection}; - /// It is the name of the server that will be sent to the client. String server_display_name; diff --git a/dbms/src/Server/config_tools.h.in b/dbms/src/Server/config_tools.h.in index 61aa3f41591..03a478a6473 100644 --- a/dbms/src/Server/config_tools.h.in +++ b/dbms/src/Server/config_tools.h.in @@ -6,4 +6,5 @@ #cmakedefine01 ENABLE_CLICKHOUSE_CLIENT #cmakedefine01 ENABLE_TIFLASH_DTTOOL #cmakedefine01 ENABLE_TIFLASH_DTWORKLOAD +#cmakedefine01 ENABLE_TIFLASH_PAGEWORKLOAD #cmakedefine01 ENABLE_TIFLASH_PAGECTL diff --git a/dbms/src/Server/main.cpp b/dbms/src/Server/main.cpp index 11cccf84729..dbcaa4f38fc 100644 --- a/dbms/src/Server/main.cpp +++ b/dbms/src/Server/main.cpp @@ -36,7 +36,10 @@ #include #endif #if ENABLE_TIFLASH_DTWORKLOAD -#include +#include +#endif +#if ENABLE_TIFLASH_PAGEWORKLOAD +#include #endif #if ENABLE_TIFLASH_PAGECTL #include @@ -107,6 +110,9 @@ std::pair clickhouse_applications[] = { #if ENABLE_TIFLASH_DTWORKLOAD {"dtworkload", DB::DM::tests::DTWorkload::mainEntry}, #endif +#if ENABLE_TIFLASH_PAGEWORKLOAD + {"pageworkload", DB::PS::tests::StressWorkload::mainEntry}, +#endif #if ENABLE_TIFLASH_PAGECTL {"pagectl", DB::PageStorageCtl::mainEntry}, #endif diff --git a/dbms/src/Storages/BackgroundProcessingPool.cpp b/dbms/src/Storages/BackgroundProcessingPool.cpp index 96c2c6cc622..45ba032bf53 100644 --- a/dbms/src/Storages/BackgroundProcessingPool.cpp +++ b/dbms/src/Storages/BackgroundProcessingPool.cpp @@ -42,7 +42,6 @@ inline static pid_t getTid() namespace CurrentMetrics { -extern const Metric BackgroundPoolTask; extern const Metric MemoryTrackingInBackgroundProcessingPool; } // namespace CurrentMetrics @@ -215,8 +214,6 @@ void BackgroundProcessingPool::threadFunction() continue; { - CurrentMetrics::Increment metric_increment{CurrentMetrics::BackgroundPoolTask}; - bool done_work = false; if (!task->multi) { diff --git a/dbms/src/Storages/CMakeLists.txt b/dbms/src/Storages/CMakeLists.txt index 90cc7a01d5b..68a2e6c9a74 100644 --- a/dbms/src/Storages/CMakeLists.txt +++ b/dbms/src/Storages/CMakeLists.txt @@ -15,16 +15,15 @@ add_subdirectory (System) add_subdirectory (Page) add_subdirectory (DeltaMerge/File/dtpb) -add_subdirectory (DeltaMerge/tools) +add_subdirectory (DeltaMerge/workload) +add_subdirectory (Page/workload) if (ENABLE_TESTS) add_subdirectory (tests EXCLUDE_FROM_ALL) add_subdirectory (Transaction/tests EXCLUDE_FROM_ALL) add_subdirectory (Page/V2/tests EXCLUDE_FROM_ALL) - if (ENABLE_V3_PAGESTORAGE) - add_subdirectory (Page/V3 EXCLUDE_FROM_ALL) - add_subdirectory (Page/V3/tests EXCLUDE_FROM_ALL) - endif () + add_subdirectory (Page/V3 EXCLUDE_FROM_ALL) + add_subdirectory (Page/V3/tests EXCLUDE_FROM_ALL) add_subdirectory (DeltaMerge/tests EXCLUDE_FROM_ALL) endif () diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp index 132732d6989..8a69b7573e2 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp @@ -141,6 +141,19 @@ bool DeltaValueSpace::ingestColumnFiles(DMContext & /*context*/, const RowKeyRan 
bool DeltaValueSpace::flush(DMContext & context) { + bool v = false; + if (!is_flushing.compare_exchange_strong(v, true)) + { + // another thread is flushing, just return. + LOG_FMT_DEBUG(log, "{}, Flush stop because other thread is flushing", simpleInfo()); + return false; + } + SCOPE_EXIT({ + bool v = true; + if (!is_flushing.compare_exchange_strong(v, false)) + throw Exception(simpleInfo() + " is expected to be flushing", ErrorCodes::LOGICAL_ERROR); + }); + LOG_FMT_DEBUG(log, "{}, Flush start", info()); /// We have two types of data needed to flush to disk: diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h index 8f14682caa8..04fb97b3004 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h @@ -77,6 +77,11 @@ class DeltaValueSpace /// Note that those things can not be done at the same time. std::atomic_bool is_updating = false; + /// Note that it's safe to run multiple flushes concurrently, but only one of them can succeed, + /// and the other threads' work is just a waste of resources. + /// So we only allow one flush task to run at any time, to avoid wasting resources. + std::atomic_bool is_flushing = false; + std::atomic last_try_flush_rows = 0; std::atomic last_try_flush_bytes = 0; std::atomic last_try_compact_column_files = 0; @@ -159,6 +164,8 @@ class DeltaValueSpace size_t getTotalCacheBytes() const; size_t getValidCacheRows() const; + bool isFlushing() const { return is_flushing; } + bool isUpdating() const { return is_updating; } bool tryLockUpdating
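
For reference, the single-flusher guard introduced above can be reduced to a minimal standalone sketch, assuming only the C++ standard library (names are hypothetical; the real code resets the flag via `SCOPE_EXIT` and throws TiFlash's own `Exception` type):

```cpp
#include <atomic>

// One flush at a time: the first caller wins the CAS and does the work;
// concurrent callers return false immediately instead of duplicating it.
struct FlushGuard
{
    std::atomic_bool is_flushing{false};

    bool tryFlush()
    {
        bool expected = false;
        if (!is_flushing.compare_exchange_strong(expected, true))
            return false; // another thread is flushing; caller may retry later

        try
        {
            // ... persist cached column files to disk ...
        }
        catch (...)
        {
            is_flushing.store(false); // plays the role of SCOPE_EXIT above
            throw;
        }
        is_flushing.store(false);
        return true;
    }
};
```
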
diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index a74404f3dbb..73ad22d6d1f 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -979,14 +980,14 @@ void DeltaMergeStore::deleteRange(const Context & db_context, const DB::Settings checkSegmentUpdate(dm_context, segment, ThreadType::Write); } -void DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRange & range) +bool DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRange & range, bool try_until_succeed) { RowKeyRange cur_range = range; while (!cur_range.none()) { RowKeyRange segment_range; - // Keep trying until succeeded. + // Keep trying until it succeeds, if needed. while (true) { SegmentPtr segment; @@ -1009,10 +1010,15 @@ void DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRa { break; } + else if (!try_until_succeed) + { + return false; + } } cur_range.setStart(segment_range.end); } + return true; } void DeltaMergeStore::mergeDeltaAll(const Context & context) @@ -1054,6 +1060,13 @@ std::optional DeltaMergeStore::mergeDeltaBySegment(const Contex segment = segment_it->second; } + if (!segment->flushCache(*dm_context)) + { + // If the flush failed, it means there are parallel updates to the segment in the background. + // In this case, we try again. + continue; + } + const auto new_segment = segmentMergeDelta(*dm_context, segment, run_thread); if (new_segment) { @@ -1137,9 +1150,11 @@ BlockInputStreams DeltaMergeStore::readRaw(const Context & db_context, } fiu_do_on(FailPoints::force_slow_page_storage_snapshot_release, { - std::thread thread_hold_snapshots([tasks]() { + std::thread thread_hold_snapshots([this, tasks]() { + LOG_FMT_WARNING(log, "failpoint force_slow_page_storage_snapshot_release begin"); std::this_thread::sleep_for(std::chrono::seconds(5 * 60)); (void)tasks; + LOG_FMT_WARNING(log, "failpoint force_slow_page_storage_snapshot_release end"); }); thread_hold_snapshots.detach(); }); @@ -1344,6 +1359,12 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const && (delta_rows - delta_last_try_flush_rows >= delta_cache_limit_rows || delta_bytes - delta_last_try_flush_bytes >= delta_cache_limit_bytes); bool should_foreground_flush = unsaved_rows >= delta_cache_limit_rows * 3 || unsaved_bytes >= delta_cache_limit_bytes * 3; + /// For write threads, we want to avoid a foreground flush blocking the process of applying raft commands. + /// So we increase the foreground flush threshold for write threads. + if (thread_type == ThreadType::Write) + { + should_foreground_flush = unsaved_rows >= delta_cache_limit_rows * 10 || unsaved_bytes >= delta_cache_limit_bytes * 10; + } bool should_background_merge_delta = ((delta_check_rows >= delta_limit_rows || delta_check_bytes >= delta_limit_bytes) // && (delta_rows - delta_last_try_merge_delta_rows >= delta_cache_limit_rows @@ -1401,9 +1422,16 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const } else if (should_background_flush) { - delta_last_try_flush_rows = delta_rows; - delta_last_try_flush_bytes = delta_bytes; - try_add_background_task(BackgroundTask{TaskType::Flush, dm_context, segment, {}}); + /// It's meaningless to add more flush tasks if the segment is already flushing, + /// because only one flush task can proceed at any time. + /// After the current flush task finishes, + /// it will call `checkSegmentUpdate` again to check whether there are more flush tasks to do. + if (!segment->isFlushing()) + { + delta_last_try_flush_rows = delta_rows; + delta_last_try_flush_bytes = delta_bytes; + try_add_background_task(BackgroundTask{TaskType::Flush, dm_context, segment, {}}); + } } } @@ -1499,7 +1527,12 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const return false; }; auto try_bg_compact = [&]() { - if (should_compact) + /// Compact should be a really low-priority task. + /// If the segment is flushing, + /// we should avoid adding a background compact task, to reduce lock contention on the segment and save disk throughput. + /// After the current flush task completes, + /// it will call `checkSegmentUpdate` again to check whether there are other kinds of tasks to do. + if (should_compact && !segment->isFlushing()) { delta_last_try_compact_column_files = column_file_count; try_add_background_task(BackgroundTask{TaskType::Compact, dm_context, segment, {}});
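
The flush-scheduling policy in `checkSegmentUpdate` above can be summarized by a hypothetical, simplified model (this is not TiFlash's actual API; the real code additionally tracks "last try" counters and delta-merge/split/compact conditions):

```cpp
#include <cstddef>

enum class ThreadType { Write, Read, BG };

struct FlushDecision
{
    bool foreground = false;
    bool background = false;
};

FlushDecision decideFlush(size_t unsaved_rows, size_t unsaved_bytes,
                          size_t cache_limit_rows, size_t cache_limit_bytes,
                          ThreadType thread_type, bool segment_is_flushing)
{
    FlushDecision d;
    // Write (raft-apply) threads tolerate 10x the cache limit before a
    // blocking foreground flush; other threads flush in the foreground at 3x.
    const size_t factor = (thread_type == ThreadType::Write) ? 10 : 3;
    d.foreground = unsaved_rows >= cache_limit_rows * factor
        || unsaved_bytes >= cache_limit_bytes * factor;
    // Scheduling a background flush while one is already running is wasted
    // work, since only one flush can hold the is_flushing guard at a time.
    if (!d.foreground && !segment_is_flushing)
        d.background = unsaved_rows >= cache_limit_rows || unsaved_bytes >= cache_limit_bytes;
    return d;
}
```
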
diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h index 705481ca107..57c2a42b807 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h @@ -367,14 +367,14 @@ class DeltaMergeStore : private boost::noncopyable const SegmentIdSet & read_segments = {}, size_t extra_table_id_index = InvalidColumnID); - /// Force flush all data to disk. - void flushCache(const Context & context, const RowKeyRange & range) + /// Try to flush all data in `range` to disk and return whether the task succeeds. + bool flushCache(const Context & context, const RowKeyRange & range, bool try_until_succeed = true) { auto dm_context = newDMContext(context, context.getSettingsRef()); - flushCache(dm_context, range); + return flushCache(dm_context, range, try_until_succeed); } - void flushCache(const DMContextPtr & dm_context, const RowKeyRange & range); + bool flushCache(const DMContextPtr & dm_context, const RowKeyRange & range, bool try_until_succeed = true); /// Merge delta into the stable layer for all segments. /// diff --git a/dbms/src/Storages/DeltaMerge/DeltaTree.h b/dbms/src/Storages/DeltaMerge/DeltaTree.h index 47674ab2cfc..29e127fe35f 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaTree.h +++ b/dbms/src/Storages/DeltaMerge/DeltaTree.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -810,6 +811,20 @@ class DeltaTree template InternPtr afterNodeUpdated(T * node); +#ifdef __x86_64__ + template + InternPtr afterNodeUpdatedGeneric(T * node); + + template + InternPtr afterNodeUpdatedAVX512(T * node); + + template + InternPtr afterNodeUpdatedAVX(T * node); + + template + InternPtr afterNodeUpdatedSSE4(T * node); +#endif + inline void afterLeafUpdated(LeafPtr leaf) { if (leaf->count == 0 && isRootOnly()) @@ -1348,158 +1363,86 @@ typename DT_CLASS::InterAndSid DT_CLASS::submitMinSid(T * node, UInt64 subtree_m } } -DT_TEMPLATE -template -typename DT_CLASS::InternPtr DT_CLASS::afterNodeUpdated(T * node) +#ifndef __x86_64__ +#define TIFLASH_DT_IMPL_NAME afterNodeUpdated +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME +#else + +// generic implementation +#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedGeneric +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME + +// avx512 implementation +TIFLASH_BEGIN_AVX512_SPECIFIC_CODE +#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedAVX512 +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME +TIFLASH_END_TARGET_SPECIFIC_CODE + +// avx implementation +TIFLASH_BEGIN_AVX_SPECIFIC_CODE +#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedAVX +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME +TIFLASH_END_TARGET_SPECIFIC_CODE + +// sse4 implementation +TIFLASH_BEGIN_SSE4_SPECIFIC_CODE +#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedSSE4 +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME +TIFLASH_END_TARGET_SPECIFIC_CODE + +namespace Impl { - if (!node) - return {}; - - constexpr bool is_leaf = std::is_same::value; +enum class DeltaTreeVariant +{ + Generic, + SSE4, + AVX, + AVX512 +}; - if (root == asNode(node) && !isLeaf(root) && node->count == 1) +static inline DeltaTreeVariant resolveDeltaTreeVariant() +{ + if (DB::TargetSpecific::AVX512Checker::runtimeSupport()) { - /// Decrease tree height. - root = as(Intern, root)->children[0]; - - --(node->count); - freeNode(node); - - if (isLeaf(root)) - as(Leaf, root)->parent = nullptr; - else - as(Intern, root)->parent = nullptr; - --height; - - LOG_FMT_TRACE(log, "height {} -> {}", (height + 1), height); - - return {}; + return DeltaTreeVariant::AVX512; } - - auto parent = node->parent; - bool parent_updated = false; - - if (T::overflow(node->count)) // split + if (DB::TargetSpecific::AVXChecker::runtimeSupport()) { - if (!parent) - { - /// Increase tree height.
- parent = createNode(); - root = asNode(parent); - - parent->deltas[0] = checkDelta(node->getDelta()); - parent->children[0] = asNode(node); - ++(parent->count); - parent->refreshChildParent(); - - ++height; - - LOG_FMT_TRACE(log, "height {} -> {}", (height - 1), height); - } - - auto pos = parent->searchChild(asNode(node)); - - T * next_n = createNode(); - - UInt64 sep_sid = node->split(next_n); - - // handle parent update - parent->shiftEntries(pos + 1, 1); - // for current node - parent->deltas[pos] = checkDelta(node->getDelta()); - // for next node - parent->sids[pos] = sep_sid; - parent->deltas[pos + 1] = checkDelta(next_n->getDelta()); - parent->children[pos + 1] = asNode(next_n); - - ++(parent->count); - - if constexpr (is_leaf) - { - if (as(Leaf, node) == right_leaf) - right_leaf = as(Leaf, next_n); - } - - parent_updated = true; + return DeltaTreeVariant::AVX; } - else if (T::underflow(node->count) && root != asNode(node)) // adopt or merge + if (DB::TargetSpecific::SSE4Checker::runtimeSupport()) { - auto pos = parent->searchChild(asNode(node)); - - // currently we always adopt from the right one if possible - bool is_sibling_left; - size_t sibling_pos; - T * sibling; - - if (unlikely(parent->count <= 1)) - throw Exception("Unexpected parent entry count: " + DB::toString(parent->count)); - - if (pos == parent->count - 1) - { - is_sibling_left = true; - sibling_pos = pos - 1; - sibling = as(T, parent->children[sibling_pos]); - } - else - { - is_sibling_left = false; - sibling_pos = pos + 1; - sibling = as(T, parent->children[sibling_pos]); - } - - if (unlikely(sibling->parent != node->parent)) - throw Exception("parent not the same"); - - auto after_adopt = (node->count + sibling->count) / 2; - if (T::underflow(after_adopt)) - { - // Do merge. - // adoption won't work because the sibling doesn't have enough entries. - - node->merge(sibling, is_sibling_left, pos); - freeNode(sibling); - - pos = std::min(pos, sibling_pos); - parent->deltas[pos] = checkDelta(node->getDelta()); - parent->children[pos] = asNode(node); - parent->shiftEntries(pos + 2, -1); - - if constexpr (is_leaf) - { - if (is_sibling_left && (as(Leaf, sibling) == left_leaf)) - left_leaf = as(Leaf, node); - else if (!is_sibling_left && as(Leaf, sibling) == right_leaf) - right_leaf = as(Leaf, node); - } - --(parent->count); - } - else - { - // Do adoption. 
- - auto adopt_count = after_adopt - node->count; - auto new_sep_sid = node->adopt(sibling, is_sibling_left, adopt_count, pos); + return DeltaTreeVariant::SSE4; + } + return DeltaTreeVariant::Generic; +} - parent->sids[std::min(pos, sibling_pos)] = new_sep_sid; - parent->deltas[pos] = checkDelta(node->getDelta()); - parent->deltas[sibling_pos] = checkDelta(sibling->getDelta()); - } +static inline DeltaTreeVariant DELTA_TREE_VARIANT = resolveDeltaTreeVariant(); +} // namespace Impl - parent_updated = true; - } - else if (parent) +DT_TEMPLATE +template +typename DT_CLASS::InternPtr DT_CLASS::afterNodeUpdated(T * node) +{ + switch (Impl::DELTA_TREE_VARIANT) { - auto pos = parent->searchChild(asNode(node)); - auto delta = node->getDelta(); - parent_updated = parent->deltas[pos] != delta; - parent->deltas[pos] = checkDelta(delta); + case Impl::DeltaTreeVariant::Generic: + return afterNodeUpdatedGeneric(node); + case Impl::DeltaTreeVariant::SSE4: + return afterNodeUpdatedSSE4(node); + case Impl::DeltaTreeVariant::AVX: + return afterNodeUpdatedAVX(node); + case Impl::DeltaTreeVariant::AVX512: + return afterNodeUpdatedAVX512(node); } - - if (parent_updated) - return parent; - else - return {}; } +#endif + #undef as #undef asNode diff --git a/dbms/src/Storages/DeltaMerge/DeltaTree.ipp b/dbms/src/Storages/DeltaMerge/DeltaTree.ipp new file mode 100644 index 00000000000..27b8a3b96f1 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/DeltaTree.ipp @@ -0,0 +1,165 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +DT_TEMPLATE +template +__attribute__((noinline, flatten)) typename DT_CLASS::InternPtr DT_CLASS::TIFLASH_DT_IMPL_NAME(T * node) +{ + if (!node) + return {}; + + constexpr bool is_leaf = std::is_same::value; + + if (root == asNode(node) && !isLeaf(root) && node->count == 1) + { + /// Decrease tree height. + root = as(Intern, root)->children[0]; + + --(node->count); + freeNode(node); + + if (isLeaf(root)) + as(Leaf, root)->parent = nullptr; + else + as(Intern, root)->parent = nullptr; + --height; + + LOG_FMT_TRACE(log, "height {} -> {}", (height + 1), height); + + return {}; + } + + auto parent = node->parent; + bool parent_updated = false; + + if (T::overflow(node->count)) // split + { + if (!parent) + { + /// Increase tree height. 
+ parent = createNode(); + root = asNode(parent); + + parent->deltas[0] = checkDelta(node->getDelta()); + parent->children[0] = asNode(node); + ++(parent->count); + parent->refreshChildParent(); + + ++height; + + LOG_FMT_TRACE(log, "height {} -> {}", (height - 1), height); + } + + auto pos = parent->searchChild(asNode(node)); + + T * next_n = createNode(); + + UInt64 sep_sid = node->split(next_n); + + // handle parent update + parent->shiftEntries(pos + 1, 1); + // for current node + parent->deltas[pos] = checkDelta(node->getDelta()); + // for next node + parent->sids[pos] = sep_sid; + parent->deltas[pos + 1] = checkDelta(next_n->getDelta()); + parent->children[pos + 1] = asNode(next_n); + + ++(parent->count); + + if constexpr (is_leaf) + { + if (as(Leaf, node) == right_leaf) + right_leaf = as(Leaf, next_n); + } + + parent_updated = true; + } + else if (T::underflow(node->count) && root != asNode(node)) // adopt or merge + { + auto pos = parent->searchChild(asNode(node)); + + // currently we always adopt from the right one if possible + bool is_sibling_left; + size_t sibling_pos; + T * sibling; + + if (unlikely(parent->count <= 1)) + throw Exception("Unexpected parent entry count: " + DB::toString(parent->count)); + + if (pos == parent->count - 1) + { + is_sibling_left = true; + sibling_pos = pos - 1; + sibling = as(T, parent->children[sibling_pos]); + } + else + { + is_sibling_left = false; + sibling_pos = pos + 1; + sibling = as(T, parent->children[sibling_pos]); + } + + if (unlikely(sibling->parent != node->parent)) + throw Exception("parent not the same"); + + auto after_adopt = (node->count + sibling->count) / 2; + if (T::underflow(after_adopt)) + { + // Do merge. + // adoption won't work because the sibling doesn't have enough entries. + + node->merge(sibling, is_sibling_left, pos); + freeNode(sibling); + + pos = std::min(pos, sibling_pos); + parent->deltas[pos] = checkDelta(node->getDelta()); + parent->children[pos] = asNode(node); + parent->shiftEntries(pos + 2, -1); + + if constexpr (is_leaf) + { + if (is_sibling_left && (as(Leaf, sibling) == left_leaf)) + left_leaf = as(Leaf, node); + else if (!is_sibling_left && as(Leaf, sibling) == right_leaf) + right_leaf = as(Leaf, node); + } + --(parent->count); + } + else + { + // Do adoption. + + auto adopt_count = after_adopt - node->count; + auto new_sep_sid = node->adopt(sibling, is_sibling_left, adopt_count, pos); + + parent->sids[std::min(pos, sibling_pos)] = new_sep_sid; + parent->deltas[pos] = checkDelta(node->getDelta()); + parent->deltas[sibling_pos] = checkDelta(sibling->getDelta()); + } + + parent_updated = true; + } + else if (parent) + { + auto pos = parent->searchChild(asNode(node)); + auto delta = node->getDelta(); + parent_updated = parent->deltas[pos] != delta; + parent->deltas[pos] = checkDelta(delta); + } + + if (parent_updated) + return parent; + else + return {}; +} \ No newline at end of file diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp index 3bff05ef19f..272d548eee1 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp @@ -72,10 +72,9 @@ DMFileWriter::DMFileWriter(const DMFilePtr & dmfile_, for (auto & cd : write_columns) { // TODO: currently we only generate index for Integers, Date, DateTime types, and this should be configurable by user. 
- // TODO: If column type is nullable, we won't generate index for it /// for handle column always generate index - bool do_index = cd.id == EXTRA_HANDLE_COLUMN_ID || cd.type->isInteger() || cd.type->isDateOrDateTime(); - + auto type = removeNullable(cd.type); + bool do_index = cd.id == EXTRA_HANDLE_COLUMN_ID || type->isInteger() || type->isDateOrDateTime(); if (options.flags.isSingleFile()) { if (do_index) @@ -122,7 +121,7 @@ void DMFileWriter::addStreams(ColId col_id, DataTypePtr type, bool do_index) void DMFileWriter::write(const Block & block, const BlockProperty & block_property) { is_empty_file = false; - DMFile::PackStat stat; + DMFile::PackStat stat{}; stat.rows = block.rows(); stat.not_clean = block_property.not_clean_rows; stat.bytes = block.bytes(); // This is bytes of pack data in memory. @@ -219,7 +218,7 @@ void DMFileWriter::writeColumn(ColId col_id, const IDataType & type, const IColu "Type shouldn't be nullable when substream_path's type is NullMap.", Errors::DeltaTree::Internal); - const ColumnNullable & col = static_cast(column); + const auto & col = static_cast(column); col.checkConsistency(); DataTypeUInt8().serializeBinaryBulk(col.getNullMapColumn(), single_file_stream->original_layer, 0, rows); } @@ -230,8 +229,8 @@ void DMFileWriter::writeColumn(ColId col_id, const IDataType & type, const IColu "Type shouldn't be nullable when substream_path's type is NullableElements.", Errors::DeltaTree::Internal); - const DataTypeNullable & nullable_type = static_cast(type); - const ColumnNullable & col = static_cast(column); + const auto & nullable_type = static_cast(type); + const auto & col = static_cast(column); nullable_type.getNestedType()->serializeBinaryBulk(col.getNestedColumn(), single_file_stream->original_layer, 0, rows); } else
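
The nullable-index change above boils down to unwrapping `Nullable(T)` before testing the nested type. A hedged restatement of that rule, reusing the same TiFlash helpers the diff itself relies on (`shouldBuildMinMaxIndex` is an illustrative name, not a real function, and the sketch assumes the surrounding DeltaMerge headers):

```cpp
// Nullable(Int64), Nullable(DateTime), ... now qualify for a minmax index,
// because the decision looks at the nested type rather than the wrapper.
bool shouldBuildMinMaxIndex(ColId column_id, const DataTypePtr & column_type)
{
    auto type = removeNullable(column_type); // Nullable(Int64) -> Int64
    return column_id == EXTRA_HANDLE_COLUMN_ID // the handle column is always indexed
        || type->isInteger()
        || type->isDateOrDateTime();
}
```
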
diff --git a/dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.cpp b/dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.cpp index 2681284948c..6229d54c169 100644 --- a/dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.cpp +++ b/dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.cpp @@ -61,7 +61,6 @@ inline std::pair minmax(const IColumn & column, const ColumnVect void MinMaxIndex::addPack(const IColumn & column, const ColumnVector * del_mark) { - const IColumn * column_ptr = &column; auto size = column.size(); bool has_null = false; if (column.isColumnNullable()) @@ -70,7 +69,6 @@ void MinMaxIndex::addPack(const IColumn & column, const ColumnVector * de const auto & nullable_column = static_cast(column); const auto & null_mark_data = nullable_column.getNullMapColumn().getData(); - column_ptr = &nullable_column.getNestedColumn(); for (size_t i = 0; i < size; ++i) { @@ -82,14 +80,13 @@ void MinMaxIndex::addPack(const IColumn & column, const ColumnVector * de } } - const IColumn & updated_column = *column_ptr; - auto [min_index, max_index] = details::minmax(updated_column, del_mark, 0, updated_column.size()); + auto [min_index, max_index] = details::minmax(column, del_mark, 0, column.size()); if (min_index != NONE_EXIST) { has_null_marks->push_back(has_null); has_value_marks->push_back(1); - minmaxes->insertFrom(updated_column, min_index); - minmaxes->insertFrom(updated_column, max_index); + minmaxes->insertFrom(column, min_index); + minmaxes->insertFrom(column, max_index); } else { @@ -158,6 +155,62 @@ std::pair MinMaxIndex::getUInt64MinMax(size_t pack_index) return {minmaxes->get64(pack_index * 2), minmaxes->get64(pack_index * 2 + 1)}; } +RSResult MinMaxIndex::checkNullableEqual(size_t pack_index, const Field & value, const DataTypePtr & type) +{ + const auto & column_nullable = static_cast(*minmaxes); + + const auto * raw_type = type.get(); + +#define DISPATCH(TYPE) \ + if (typeid_cast(raw_type)) \ + { \ + auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); \ + auto min = minmaxes_data[pack_index * 2]; \ + auto max = minmaxes_data[pack_index * 2 + 1]; \ + return RoughCheck::checkEqual(value, type, min, max); \ + } + FOR_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + if (typeid_cast(raw_type)) + { + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkEqual(value, type, min, max); + } + if (typeid_cast(raw_type)) + { + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkEqual(value, type, min, max); + } + if (typeid_cast(raw_type) || typeid_cast(raw_type)) + { + // For DataTypeMyDateTime / DataTypeMyDate, simply compare them as comparing UInt64 is OK. + // Check `struct MyTimeBase` for more details. + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkEqual(value, type, min, max); + } + if (typeid_cast(raw_type)) + { + const auto * string_column = checkAndGetColumn(column_nullable.getNestedColumnPtr().get()); + const auto & chars = string_column->getChars(); + const auto & offsets = string_column->getOffsets(); + size_t pos = pack_index * 2; + size_t prev_offset = pos == 0 ? 0 : offsets[pos - 1]; + // todo use StringRef instead of String + auto min = String(reinterpret_cast(&chars[prev_offset]), offsets[pos] - prev_offset - 1); + pos = pack_index * 2 + 1; + prev_offset = offsets[pos - 1]; + auto max = String(reinterpret_cast(&chars[prev_offset]), offsets[pos] - prev_offset - 1); + return RoughCheck::checkEqual(value, type, min, max); + } + return RSResult::Some; +} + RSResult MinMaxIndex::checkEqual(size_t pack_index, const Field & value, const DataTypePtr & type) { if ((*has_null_marks)[pack_index] || value.isNull()) @@ -165,7 +218,13 @@ RSResult MinMaxIndex::checkEqual(size_t pack_index, const Field & value, const D return RSResult::None; + // If minmaxes_data contains null values, the value of minmaxes_data[i] is meaningless and may be just a random value. + // But we have checked has_null_marks above and ensured that there are no null values in the MinMax index.
const auto * raw_type = type.get(); + if (typeid_cast(raw_type)) + { + return checkNullableEqual(pack_index, value, removeNullable(type)); + } #define DISPATCH(TYPE) \ if (typeid_cast(raw_type)) \ { \ @@ -215,6 +274,62 @@ RSResult MinMaxIndex::checkEqual(size_t pack_index, const Field & value, const D } return RSResult::Some; } + +RSResult MinMaxIndex::checkNullableGreater(size_t pack_index, const Field & value, const DataTypePtr & type) +{ + const auto & column_nullable = static_cast(*minmaxes); + const auto * raw_type = type.get(); + +#define DISPATCH(TYPE) \ + if (typeid_cast(raw_type)) \ + { \ + auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); \ + auto min = minmaxes_data[pack_index * 2]; \ + auto max = minmaxes_data[pack_index * 2 + 1]; \ + return RoughCheck::checkGreater(value, type, min, max); \ + } + FOR_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + if (typeid_cast(raw_type)) + { + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkGreater(value, type, min, max); + } + if (typeid_cast(raw_type)) + { + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkGreater(value, type, min, max); + } + if (typeid_cast(raw_type) || typeid_cast(raw_type)) + { + // For DataTypeMyDateTime / DataTypeMyDate, simply compare them as comparing UInt64 is OK. + // Check `struct MyTimeBase` for more details. + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkGreater(value, type, min, max); + } + if (typeid_cast(raw_type)) + { + const auto * string_column = checkAndGetColumn(column_nullable.getNestedColumnPtr().get()); + const auto & chars = string_column->getChars(); + const auto & offsets = string_column->getOffsets(); + size_t pos = pack_index * 2; + size_t prev_offset = pos == 0 ? 
0 : offsets[pos - 1]; + // todo use StringRef instead of String + auto min = String(reinterpret_cast(&chars[prev_offset]), offsets[pos] - prev_offset - 1); + pos = pack_index * 2 + 1; + prev_offset = offsets[pos - 1]; + auto max = String(reinterpret_cast(&chars[prev_offset]), offsets[pos] - prev_offset - 1); + return RoughCheck::checkGreater(value, type, min, max); + } + return RSResult::Some; +} + RSResult MinMaxIndex::checkGreater(size_t pack_index, const Field & value, const DataTypePtr & type, int /*nan_direction_hint*/) { if ((*has_null_marks)[pack_index] || value.isNull()) @@ -223,6 +338,10 @@ RSResult MinMaxIndex::checkGreater(size_t pack_index, const Field & value, const return RSResult::None; const auto * raw_type = type.get(); + if (typeid_cast(raw_type)) + { + return checkNullableGreater(pack_index, value, removeNullable(type)); + } #define DISPATCH(TYPE) \ if (typeid_cast(raw_type)) \ { \ @@ -272,6 +391,62 @@ RSResult MinMaxIndex::checkGreater(size_t pack_index, const Field & value, const } return RSResult::Some; } + +RSResult MinMaxIndex::checkNullableGreaterEqual(size_t pack_index, const Field & value, const DataTypePtr & type) +{ + const auto & column_nullable = static_cast(*minmaxes); + + const auto * raw_type = type.get(); +#define DISPATCH(TYPE) \ + if (typeid_cast(raw_type)) \ + { \ + auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); \ + auto min = minmaxes_data[pack_index * 2]; \ + auto max = minmaxes_data[pack_index * 2 + 1]; \ + return RoughCheck::checkGreaterEqual(value, type, min, max); \ + } + FOR_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + if (typeid_cast(raw_type)) + { + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkGreaterEqual(value, type, min, max); + } + if (typeid_cast(raw_type)) + { + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkGreaterEqual(value, type, min, max); + } + if (typeid_cast(raw_type) || typeid_cast(raw_type)) + { + // For DataTypeMyDateTime / DataTypeMyDate, simply compare them as comparing UInt64 is OK. + // Check `struct MyTimeBase` for more details. + const auto & minmaxes_data = toColumnVectorData(column_nullable.getNestedColumnPtr()); + auto min = minmaxes_data[pack_index * 2]; + auto max = minmaxes_data[pack_index * 2 + 1]; + return RoughCheck::checkGreaterEqual(value, type, min, max); + } + if (typeid_cast(raw_type)) + { + const auto * string_column = checkAndGetColumn(column_nullable.getNestedColumnPtr().get()); + const auto & chars = string_column->getChars(); + const auto & offsets = string_column->getOffsets(); + size_t pos = pack_index * 2; + size_t prev_offset = pos == 0 ?
0 : offsets[pos - 1]; + // todo use StringRef instead of String + auto min = String(reinterpret_cast(&chars[prev_offset]), offsets[pos] - prev_offset - 1); + pos = pack_index * 2 + 1; + prev_offset = offsets[pos - 1]; + auto max = String(reinterpret_cast(&chars[prev_offset]), offsets[pos] - prev_offset - 1); + return RoughCheck::checkGreaterEqual(value, type, min, max); + } + return RSResult::Some; +} + RSResult MinMaxIndex::checkGreaterEqual(size_t pack_index, const Field & value, const DataTypePtr & type, int /*nan_direction_hint*/) { if ((*has_null_marks)[pack_index] || value.isNull()) @@ -280,6 +455,10 @@ RSResult MinMaxIndex::checkGreaterEqual(size_t pack_index, const Field & value, return RSResult::None; const auto * raw_type = type.get(); + if (typeid_cast(raw_type)) + { + return checkNullableGreaterEqual(pack_index, value, removeNullable(type)); + } #define DISPATCH(TYPE) \ if (typeid_cast(raw_type)) \ { \ diff --git a/dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.h b/dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.h index 7efd37fafa4..73284333c73 100644 --- a/dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.h +++ b/dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.h @@ -81,6 +81,9 @@ class MinMaxIndex RSResult checkGreaterEqual(size_t pack_index, const Field & value, const DataTypePtr & type, int nan_direction); static String toString(); + RSResult checkNullableEqual(size_t pack_index, const Field & value, const DataTypePtr & type); + RSResult checkNullableGreater(size_t pack_index, const Field & value, const DataTypePtr & type); + RSResult checkNullableGreaterEqual(size_t pack_index, const Field & value, const DataTypePtr & type); }; diff --git a/dbms/src/Storages/DeltaMerge/Segment.h b/dbms/src/Storages/DeltaMerge/Segment.h index cccfc5091b9..8058329ae91 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.h +++ b/dbms/src/Storages/DeltaMerge/Segment.h @@ -300,6 +300,8 @@ class Segment : private boost::noncopyable void drop(const FileProviderPtr & file_provider, WriteBatches & wbs); + bool isFlushing() const { return delta->isFlushing(); } + RowsAndBytes getRowsAndBytesInRange( DMContext & dm_context, const SegmentSnapshotPtr & segment_snap, diff --git a/dbms/src/Storages/DeltaMerge/StoragePool.cpp b/dbms/src/Storages/DeltaMerge/StoragePool.cpp index 752898f9c75..2791a74e9e3 100644 --- a/dbms/src/Storages/DeltaMerge/StoragePool.cpp +++ b/dbms/src/Storages/DeltaMerge/StoragePool.cpp @@ -624,8 +624,8 @@ PageId StoragePool::newDataPageIdForDTFile(StableDiskDelegator & delegator, cons auto existed_path = delegator.getDTFilePath(dtfile_id, /*throw_on_not_exist=*/false); fiu_do_on(FailPoints::force_set_dtfile_exist_when_acquire_id, { - static size_t fail_point_called = 0; - if (existed_path.empty() && fail_point_called % 10 == 0) + static std::atomic fail_point_called(0); + if (existed_path.empty() && fail_point_called.load() % 10 == 0) { existed_path = ""; } diff --git a/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h b/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h index b35dae0cbe2..84fafbc46ef 100644 --- a/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h +++ b/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h @@ -273,7 +273,8 @@ class DMTestEnv DataTypePtr pk_type = EXTRA_HANDLE_COLUMN_INT_TYPE, bool is_common_handle = false, size_t rowkey_column_size = 1, - bool with_internal_columns = true) + bool with_internal_columns = true, + bool is_deleted = false) { Block block; const size_t num_rows = (end - beg); @@ -324,7 +325,7 @@ class DMTestEnv VERSION_COLUMN_ID)); // tag_col 
block.insert(DB::tests::createColumn( - std::vector(num_rows, 0), + std::vector(num_rows, is_deleted), TAG_COLUMN_NAME, TAG_COLUMN_ID)); } diff --git a/dbms/src/Storages/DeltaMerge/tests/MultiSegmentTestUtil.h b/dbms/src/Storages/DeltaMerge/tests/MultiSegmentTestUtil.h index 7c5b0b2416d..787a521ded3 100644 --- a/dbms/src/Storages/DeltaMerge/tests/MultiSegmentTestUtil.h +++ b/dbms/src/Storages/DeltaMerge/tests/MultiSegmentTestUtil.h @@ -88,6 +88,7 @@ class MultiSegmentTestUtil : private boost::noncopyable // Check there is only one segment ASSERT_EQ(store->segments.size(), 1); const auto & [_key, seg] = *store->segments.begin(); + (void)_key; ASSERT_EQ(seg->getDelta()->getRows(), n_avg_rows_per_segment * 4); ASSERT_EQ(seg->getStable()->getRows(), 0); @@ -108,6 +109,7 @@ class MultiSegmentTestUtil : private boost::noncopyable auto segment_idx = 0; for (auto & [_key, seg] : store->segments) { + (void)_key; LOG_FMT_INFO(log, "Segment #{}: Range = {}", segment_idx, seg->getRowKeyRange().toDebugString()); ASSERT_EQ(seg->getDelta()->getRows(), 0); ASSERT_GT(seg->getStable()->getRows(), 0); // We don't check the exact rows of each segment. @@ -147,6 +149,7 @@ class MultiSegmentTestUtil : private boost::noncopyable auto segment_idx = 0; for (auto & [_key, seg] : store->segments) { + (void)_key; ASSERT_EQ(seg->getDelta()->getRows(), expected_delta_rows[segment_idx]) << "Assert failed for segment #" << segment_idx; ASSERT_EQ(seg->getStable()->getRows(), expected_stable_rows[segment_idx]) << "Assert failed for segment #" << segment_idx; segment_idx++; diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp index e934f7a2049..b7913c44a2c 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp @@ -3564,7 +3564,6 @@ class DeltaMergeStoreMergeDeltaBySegmentTest public: DeltaMergeStoreMergeDeltaBySegmentTest() { - log = &Poco::Logger::get(DB::base::TiFlashStorageTestBasic::getCurrentFullTestName()); std::tie(ps_ver, pk_type) = GetParam(); } @@ -3607,8 +3606,6 @@ class DeltaMergeStoreMergeDeltaBySegmentTest UInt64 ps_ver; DMTestEnv::PkType pk_type; - - [[maybe_unused]] Poco::Logger * log; }; INSTANTIATE_TEST_CASE_P( @@ -3765,6 +3762,55 @@ try CATCH +// Verify that unflushed data will also be compacted. +TEST_P(DeltaMergeStoreMergeDeltaBySegmentTest, Flush) +try +{ + { + // Write data to first 3 segments and flush. + auto newly_written_rows = helper->rows_by_segments[0] + helper->rows_by_segments[1] + helper->rows_by_segments[2]; + Block block = DMTestEnv::prepareSimpleWriteBlock(0, newly_written_rows, false, pk_type, 5 /* new tso */); + store->write(*db_context, db_context->getSettingsRef(), block); + store->flushCache(dm_context, RowKeyRange::newAll(store->isCommonHandle(), store->getRowKeyColumnSize())); + + helper->expected_delta_rows[0] += helper->rows_by_segments[0]; + helper->expected_delta_rows[1] += helper->rows_by_segments[1]; + helper->expected_delta_rows[2] += helper->rows_by_segments[2]; + helper->verifyExpectedRowsForAllSegments(); + + auto segment1 = std::next(store->segments.begin())->second; + ASSERT_EQ(segment1->getDelta()->getUnsavedRows(), 0); + } + { + // Write new data to segment[1] without flush. 
+ auto newly_written_rows = helper->rows_by_segments[1]; + Block block = DMTestEnv::prepareSimpleWriteBlock(helper->rows_by_segments[0], helper->rows_by_segments[0] + newly_written_rows, false, pk_type, 10 /* new tso */); + store->write(*db_context, db_context->getSettingsRef(), block); + + helper->expected_delta_rows[1] += helper->rows_by_segments[1]; + helper->verifyExpectedRowsForAllSegments(); + + auto segment1 = std::next(store->segments.begin())->second; + ASSERT_GT(segment1->getDelta()->getUnsavedRows(), 0); + } + { + auto segment1 = std::next(store->segments.begin())->second; + auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::TaskRunThread::Foreground); + ASSERT_NE(result, std::nullopt); + + segment1 = std::next(store->segments.begin())->second; + ASSERT_EQ(*result, segment1->getRowKeyRange()); + + helper->expected_stable_rows[1] += helper->expected_delta_rows[1]; + helper->expected_delta_rows[1] = 0; + helper->verifyExpectedRowsForAllSegments(); + + ASSERT_EQ(segment1->getDelta()->getUnsavedRows(), 0); + } +} +CATCH + + } // namespace tests } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_minmax_index.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_minmax_index.cpp index 96c0070b73b..bb31b687186 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_minmax_index.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_minmax_index.cpp @@ -214,14 +214,6 @@ try ASSERT_EQ(true, checkMatch(case_name, *context, "MyDateTime", "2020-09-27", createLessEqual(attr("MyDateTime"), parseMyDateTime("2020-09-27"), 0))); ASSERT_EQ(false, checkMatch(case_name, *context, "MyDateTime", "2020-09-27", createLessEqual(attr("MyDateTime"), parseMyDateTime("2020-09-26"), 0))); - /// Currently we don't do filtering for null values. i.e. if a pack contains any null values, then the pack will pass the filter. 
- ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createEqual(attr("Nullable(Int64)"), Field((Int64)101)))); - ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createIn(attr("Nullable(Int64)"), {Field((Int64)101)}))); - ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createGreater(attr("Nullable(Int64)"), Field((Int64)100), 0))); - ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createGreaterEqual(attr("Nullable(Int64)"), Field((Int64)101), 0))); - ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createLess(attr("Nullable(Int64)"), Field((Int64)100), 0))); - ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createLessEqual(attr("Nullable(Int64)"), Field((Int64)99), 0))); - ASSERT_EQ(false, checkDelMatch(case_name, *context, "Int64", "100", createEqual(attr("Int64"), Field((Int64)100)))); ASSERT_EQ(true, checkPkMatch(case_name, *context, "Int64", "100", createEqual(pkAttr(), Field((Int64)100)), true)); ASSERT_EQ(true, checkPkMatch(case_name, *context, "Int64", "100", createGreater(pkAttr(), Field((Int64)99), 0), true)); @@ -236,6 +228,80 @@ try } CATCH +TEST_F(DMMinMaxIndexTest, NullableToNullable) +try +{ + const auto * case_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + // clang-format off + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Int64)", "100", createEqual(attr("Nullable(Int64)"), Field((Int64)101)))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", "100", createEqual(attr("Nullable(Int64)"), Field((Int64)100)))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", "100", createIn(attr("Nullable(Int64)"), {Field((Int64)100)}))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Int64)", "100", createIn(attr("Nullable(Int64)"), {Field((Int64)101)}))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", "100", createGreater(attr("Nullable(Int64)"), Field((Int64)99), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Int64)", "100", createGreater(attr("Nullable(Int64)"), Field((Int64)100), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", "100", createGreaterEqual(attr("Nullable(Int64)"), Field((Int64)100), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Int64)", "100", createGreaterEqual(attr("Nullable(Int64)"), Field((Int64)101), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", "100", createLess(attr("Nullable(Int64)"), Field((Int64)101), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Int64)", "100", createLess(attr("Nullable(Int64)"), Field((Int64)100), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", "100", createLessEqual(attr("Nullable(Int64)"), Field((Int64)100), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Int64)", "100", createLessEqual(attr("Nullable(Int64)"), Field((Int64)99), 0))); + + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createEqual(attr("Nullable(Date)"), Field((String) "2020-09-27")))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", 
createEqual(attr("Nullable(Date)"), Field((String) "2020-09-28")))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createIn(attr("Nullable(Date)"), {Field((String) "2020-09-27")}))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createIn(attr("Nullable(Date)"), {Field((String) "2020-09-28")}))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createGreater(attr("Nullable(Date)"), Field((String) "2020-09-26"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createGreater(attr("Nullable(Date)"), Field((String) "2020-09-27"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createGreaterEqual(attr("Nullable(Date)"), Field((String) "2020-09-27"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createGreaterEqual(attr("Nullable(Date)"), Field((String) "2020-09-28"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createLess(attr("Nullable(Date)"), Field((String) "2020-09-28"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createLess(attr("Nullable(Date)"), Field((String) "2020-09-27"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createLessEqual(attr("Nullable(Date)"), Field((String) "2020-09-27"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(Date)", "2020-09-27", createLessEqual(attr("Nullable(Date)"), Field((String) "2020-09-26"), 0))); + + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createEqual(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:01")))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createEqual(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:02")))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createIn(attr("Nullable(DateTime)"), {Field((String) "2020-01-01 05:00:01")}))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createIn(attr("Nullable(DateTime)"), {Field((String) "2020-01-01 05:00:02")}))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createGreater(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:00"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createGreater(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:01"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createGreaterEqual(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:01"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createGreaterEqual(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:02"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createLess(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:02"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createLess(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:01"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", 
createLessEqual(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:01"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(DateTime)", "2020-01-01 05:00:01", createLessEqual(attr("Nullable(DateTime)"), Field((String) "2020-01-01 05:00:00"), 0))); + + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createEqual(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-27")))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createEqual(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-28")))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createIn(attr("Nullable(MyDateTime)"), {parseMyDateTime("2020-09-27")}))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createIn(attr("Nullable(MyDateTime)"), {parseMyDateTime("2020-09-28")}))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createGreater(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-26"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createGreater(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-27"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createGreaterEqual(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-27"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createGreaterEqual(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-28"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createLess(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-28"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createLess(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-27"), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createLessEqual(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-27"), 0))); + ASSERT_EQ(false, checkMatch(case_name, *context, "Nullable(MyDateTime)", "2020-09-27", createLessEqual(attr("Nullable(MyDateTime)"), parseMyDateTime("2020-09-26"), 0))); + + // has null + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createEqual(attr("Nullable(Int64)"), Field((Int64)101)))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createIn(attr("Nullable(Int64)"), {Field((Int64)101)}))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createGreater(attr("Nullable(Int64)"), Field((Int64)100), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createGreaterEqual(attr("Nullable(Int64)"), Field((Int64)101), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createLess(attr("Nullable(Int64)"), Field((Int64)100), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "100"}, {"1", "1", "0", "\\N"}}, createLessEqual(attr("Nullable(Int64)"), Field((Int64)99), 0))); + + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "\\N"}}, createEqual(attr("Nullable(Int64)"), 
Field((Int64)101)))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "\\N"}}, createIn(attr("Nullable(Int64)"), {Field((Int64)101)}))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "\\N"}}, createGreater(attr("Nullable(Int64)"), Field((Int64)100), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "\\N"}}, createGreaterEqual(attr("Nullable(Int64)"), Field((Int64)101), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "\\N"}}, createLess(attr("Nullable(Int64)"), Field((Int64)100), 0))); + ASSERT_EQ(true, checkMatch(case_name, *context, "Nullable(Int64)", {{"0", "0", "0", "\\N"}}, createLessEqual(attr("Nullable(Int64)"), Field((Int64)99), 0))); +} +CATCH +
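
The null-handling behavior these tests pin down can be summarized in a small, self-contained model (illustrative only; the real `RSResult`/`MinMaxIndex` API differs and also has an `All` answer, omitted here for brevity):

```cpp
// Rough-check model: a pack whose minmax entry saw any NULL can never be
// excluded (the stored min/max may be garbage), so the filter answers Some
// and the pack is always read. The same applies to a NULL literal.
enum class RSResult { None, Some };

RSResult roughCheckEqual(bool pack_has_null, bool pack_has_value, bool value_is_null,
                         long value, long pack_min, long pack_max)
{
    if (pack_has_null || value_is_null)
        return RSResult::Some; // cannot safely skip the pack
    if (!pack_has_value)
        return RSResult::None; // pack holds no non-NULL rows at all
    if (value < pack_min || value > pack_max)
        return RSResult::None; // definitely no match: the pack can be skipped
    return RSResult::Some;     // possible match: the pack must be read
}
```
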
TEST_F(DMMinMaxIndexTest, Logical) try { diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp new file mode 100644 index 00000000000..dc43ef3713b --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp @@ -0,0 +1,100 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace DM +{ +namespace tests +{ +class SegmentOperationTest : public SegmentTestBasic +{ +protected: + static void SetUpTestCase() {} +}; + +TEST_F(SegmentOperationTest, Issue4956) +try +{ + SegmentTestOptions options; + reloadWithOptions(options); + + // flush data so that the segment can be split. + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + // write data to cache, to reproduce https://github.com/pingcap/tiflash/issues/4956 + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + deleteRangeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + auto segment_id = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + ASSERT_TRUE(segment_id.has_value()); + + mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id); +} +CATCH + +TEST_F(SegmentOperationTest, TestSegment) +try +{ + SegmentTestOptions options; + reloadWithOptions(options); + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + mergeSegmentDelta(DELTA_MERGE_FIRST_SEGMENT_ID); + auto segment_id = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + ASSERT_TRUE(segment_id.has_value()); + + size_t origin_rows = getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID); + + writeSegment(*segment_id); + flushSegmentCache(*segment_id); + deleteRangeSegment(*segment_id); + writeSegmentWithDeletedPack(*segment_id); + mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id); + + EXPECT_EQ(getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID), origin_rows); +} +CATCH + +TEST_F(SegmentOperationTest, TestSegmentRandom) +try +{ + srand(time(nullptr)); + SegmentTestOptions options; + options.is_common_handle = true; + reloadWithOptions(options); + randomSegmentTest(100); +} +CATCH + +// run in CI weekly +TEST_F(SegmentOperationTest, DISABLED_TestSegmentRandomForCI) +try +{ + srand(time(nullptr)); + SegmentTestOptions options; + options.is_common_handle = true; + reloadWithOptions(options); + randomSegmentTest(10000); +} +CATCH + +} // namespace tests +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp new file mode 100644 index 00000000000..c676f2e08d5 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -0,0 +1,430 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace DM +{ +namespace tests +{ +void SegmentTestBasic::reloadWithOptions(SegmentTestOptions config) +{ + TiFlashStorageTestBasic::SetUp(); + options = config; + table_columns = std::make_shared(); + + root_segment = reload(config.is_common_handle); + ASSERT_EQ(root_segment->segmentId(), DELTA_MERGE_FIRST_SEGMENT_ID); + segments.clear(); + segments[DELTA_MERGE_FIRST_SEGMENT_ID] = root_segment; +} + +PageId SegmentTestBasic::createNewSegmentWithSomeData() +{ + SegmentPtr new_segment; + std::tie(root_segment, new_segment) = root_segment->split(dmContext(), tableColumns()); + + const size_t num_rows_write_per_batch = 100; + { + // write to segment and flush + Block block = DMTestEnv::prepareSimpleWriteBlock(0, num_rows_write_per_batch, false); + new_segment->write(dmContext(), std::move(block), true); + } + { + // write to segment and don't flush + Block block = DMTestEnv::prepareSimpleWriteBlock(num_rows_write_per_batch, 2 * num_rows_write_per_batch, false); + new_segment->write(dmContext(), std::move(block), false); + } + return new_segment->segmentId(); +} + +size_t SegmentTestBasic::getSegmentRowNumWithoutMVCC(PageId segment_id) +{ + auto segment = segments[segment_id]; + auto in = segment->getInputStreamRaw(dmContext(), *tableColumns()); + + size_t num_rows_read = 0; + in->readPrefix(); + while (Block block = in->read()) + { + num_rows_read += block.rows(); + } + in->readSuffix(); + return num_rows_read; +} + +size_t SegmentTestBasic::getSegmentRowNum(PageId segment_id) +{ + auto segment = segments[segment_id]; + auto in = segment->getInputStream(dmContext(), *tableColumns(), {segment->getRowKeyRange()}); + + size_t num_rows_read = 0; + in->readPrefix(); + while (Block block = in->read()) + { + num_rows_read += block.rows(); + } + in->readSuffix(); + return num_rows_read; +} + +void SegmentTestBasic::checkSegmentRow(PageId segment_id, size_t expected_row_num) +{ + auto segment = segments[segment_id]; + // read written data + auto in = segment->getInputStream(dmContext(), *tableColumns(), {segment->getRowKeyRange()}); + + size_t num_rows_read = 0; + in->readPrefix(); + while (Block block = in->read()) + { + num_rows_read += block.rows(); + } + in->readSuffix(); + ASSERT_EQ(num_rows_read, expected_row_num); +} + +std::optional SegmentTestBasic::splitSegment(PageId segment_id) +{ + auto origin_segment = segments[segment_id]; + size_t origin_segment_row_num = getSegmentRowNum(segment_id); + SegmentPtr segment, new_segment; + std::tie(segment, new_segment) = origin_segment->split(dmContext(), tableColumns()); + if (new_segment) + { + segments[new_segment->segmentId()] = new_segment; + segments[segment_id] = segment; + + EXPECT_EQ(origin_segment_row_num, getSegmentRowNum(segment_id) + getSegmentRowNum(new_segment->segmentId())); + return new_segment->segmentId(); + } + return std::nullopt; +} + +void SegmentTestBasic::mergeSegment(PageId left_segment_id, PageId right_segment_id) +{ + auto left_segment = segments[left_segment_id]; + auto right_segment = segments[right_segment_id]; + + size_t left_segment_row_num = getSegmentRowNum(left_segment_id); + size_t right_segment_row_num = getSegmentRowNum(right_segment_id); + LOG_FMT_TRACE(&Poco::Logger::root(), "merge in segment:{}:{} and {}:{}", left_segment->segmentId(), left_segment_row_num, right_segment->segmentId(), right_segment_row_num); + + SegmentPtr merged_segment = Segment::merge(dmContext(), tableColumns(), 
+    segments[merged_segment->segmentId()] = merged_segment;
+    auto it = segments.find(right_segment->segmentId());
+    if (it != segments.end())
+    {
+        segments.erase(it);
+    }
+    EXPECT_EQ(getSegmentRowNum(merged_segment->segmentId()), left_segment_row_num + right_segment_row_num);
+}
+
+void SegmentTestBasic::mergeSegmentDelta(PageId segment_id)
+{
+    auto segment = segments[segment_id];
+    size_t segment_row_num = getSegmentRowNum(segment_id);
+    SegmentPtr merged_segment = segment->mergeDelta(dmContext(), tableColumns());
+    segments[merged_segment->segmentId()] = merged_segment;
+    EXPECT_EQ(getSegmentRowNum(merged_segment->segmentId()), segment_row_num);
+}
+
+void SegmentTestBasic::flushSegmentCache(PageId segment_id)
+{
+    auto segment = segments[segment_id];
+    size_t segment_row_num = getSegmentRowNum(segment_id);
+    segment->flushCache(dmContext());
+    EXPECT_EQ(getSegmentRowNum(segment_id), segment_row_num);
+}
+
+std::pair<Int64, Int64> SegmentTestBasic::getSegmentKeyRange(SegmentPtr segment)
+{
+    Int64 start_key, end_key;
+    if (!options.is_common_handle)
+    {
+        start_key = segment->getRowKeyRange().getStart().int_value;
+        end_key = segment->getRowKeyRange().getEnd().int_value;
+        return {start_key, end_key};
+    }
+    EXPECT_EQ(segment->getRowKeyRange().getStart().data[0], TiDB::CodecFlagInt);
+    EXPECT_EQ(segment->getRowKeyRange().getEnd().data[0], TiDB::CodecFlagInt);
+    {
+        size_t cursor = 1;
+        start_key = DecodeInt64(cursor, String(segment->getRowKeyRange().getStart().data, segment->getRowKeyRange().getStart().size));
+    }
+    {
+        size_t cursor = 1;
+        end_key = DecodeInt64(cursor, String(segment->getRowKeyRange().getEnd().data, segment->getRowKeyRange().getEnd().size));
+    }
+    return {start_key, end_key};
+}
+
+void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows)
+{
+    if (write_rows == 0)
+    {
+        return;
+    }
+    auto segment = segments[segment_id];
+    size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id);
+    std::pair<Int64, Int64> keys = getSegmentKeyRange(segment);
+    Int64 start_key = keys.first;
+    Int64 end_key = keys.second;
+    UInt64 remain_row_num = 0;
+    if (static_cast<UInt64>(end_key - start_key) > write_rows)
+    {
+        end_key = start_key + write_rows;
+    }
+    else
+    {
+        remain_row_num = write_rows - static_cast<UInt64>(end_key - start_key);
+    }
+    {
+        // write to segment and not flush
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle);
+        segment->write(dmContext(), std::move(block), false);
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, end_key);
+        version++;
+    }
+    while (remain_row_num > 0)
+    {
+        UInt64 write_num = std::min(remain_row_num, static_cast<UInt64>(end_key - start_key));
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle);
+        segment->write(dmContext(), std::move(block), false);
+        remain_row_num -= write_num;
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, write_num + start_key);
+        version++;
+    }
+    EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows);
+}
+
+void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id)
+{
+    UInt64 write_rows = DEFAULT_MERGE_BLOCK_SIZE;
+    auto segment = segments[segment_id];
+    size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id);
+    std::pair<Int64, Int64> keys = getSegmentKeyRange(segment);
+    Int64 start_key = keys.first;
+    Int64 end_key = keys.second;
+    UInt64 remain_row_num = 0;
+    if (static_cast<UInt64>(end_key - start_key) > write_rows)
+    {
+        end_key = start_key + write_rows;
+    }
+    else
+    {
+        remain_row_num = write_rows - static_cast<UInt64>(end_key - start_key);
+    }
+    {
+        // write to segment and not flush
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true);
+        segment->write(dmContext(), std::move(block), true);
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, end_key);
+        version++;
+    }
+    while (remain_row_num > 0)
+    {
+        UInt64 write_num = std::min(remain_row_num, static_cast<UInt64>(end_key - start_key));
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true);
+        segment->write(dmContext(), std::move(block), true);
+        remain_row_num -= write_num;
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, write_num + start_key);
+        version++;
+    }
+    EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows);
+}
+
+void SegmentTestBasic::deleteRangeSegment(PageId segment_id)
+{
+    auto segment = segments[segment_id];
+    segment->write(dmContext(), /*delete_range*/ segment->getRowKeyRange());
+    EXPECT_EQ(getSegmentRowNum(segment_id), 0);
+}
+
+void SegmentTestBasic::writeRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start write segment:{}", random_segment_id);
+    writeSegment(random_segment_id);
+}
+
+void SegmentTestBasic::writeRandomSegmentWithDeletedPack()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start write segment with deleted pack:{}", random_segment_id);
+    writeSegmentWithDeletedPack(random_segment_id);
+}
+
+void SegmentTestBasic::deleteRangeRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start delete range segment:{}", random_segment_id);
+    deleteRangeSegment(random_segment_id);
+}
+
+void SegmentTestBasic::splitRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start split segment:{}", random_segment_id);
+    splitSegment(random_segment_id);
+}
+
+void SegmentTestBasic::mergeRandomSegment()
+{
+    if (segments.empty() || segments.size() == 1)
+    {
+        return;
+    }
+    std::pair<PageId, PageId> segment_pair;
+    segment_pair = getRandomMergeablePair();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start merge segment:{} and {}", segment_pair.first, segment_pair.second);
+    mergeSegment(segment_pair.first, segment_pair.second);
+}
+
+void SegmentTestBasic::mergeDeltaRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start merge delta in segment:{}", random_segment_id);
+    mergeSegmentDelta(random_segment_id);
+}
+
+void SegmentTestBasic::flushCacheRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start flush cache in segment:{}", random_segment_id);
+    flushSegmentCache(random_segment_id);
+}
+
+void SegmentTestBasic::randomSegmentTest(size_t operator_count)
+{
+    for (size_t i = 0; i < operator_count; i++)
+    {
+        auto op = static_cast<SegmentOperaterType>(random() % SegmentOperaterMax);
+        segment_operator_entries[op]();
+    }
+}
+
+PageId SegmentTestBasic::getRandomSegmentId()
+{
+    auto max_segment_id = segments.rbegin()->first;
+    PageId random_segment_id = random() % (max_segment_id + 1);
+    auto it = segments.find(random_segment_id);
+    while (it == segments.end())
+    {
+        random_segment_id = random() % (max_segment_id + 1);
+        it = segments.find(random_segment_id);
+    }
+    return random_segment_id;
+}
+
+std::pair<PageId, PageId> SegmentTestBasic::getRandomMergeablePair()
+{
+    while (true)
+    {
+        PageId random_left_segment_id = getRandomSegmentId();
+        PageId random_right_segment_id = random_left_segment_id;
+        while (random_right_segment_id == random_left_segment_id)
+        {
+            random_right_segment_id = getRandomSegmentId();
+        }
+        auto left_segment = segments[random_left_segment_id];
+        auto right_segment = segments[random_right_segment_id];
+        if (compare(left_segment->getRowKeyRange().getEnd(), right_segment->getRowKeyRange().getStart()) != 0 || left_segment->nextSegmentId() != right_segment->segmentId())
+        {
+            continue;
+        }
+        return {random_left_segment_id, random_right_segment_id};
+    }
+}
+
+RowKeyRange SegmentTestBasic::commanHandleKeyRange()
+{
+    String start_key, end_key;
+    {
+        WriteBufferFromOwnString ss;
+        ::DB::EncodeUInt(static_cast<UInt8>(TiDB::CodecFlagInt), ss);
+        ::DB::EncodeInt64(std::numeric_limits<Int64>::min(), ss);
+        start_key = ss.releaseStr();
+    }
+    {
+        WriteBufferFromOwnString ss;
+        ::DB::EncodeUInt(static_cast<UInt8>(TiDB::CodecFlagInt), ss);
+        ::DB::EncodeInt64(std::numeric_limits<Int64>::max(), ss);
+        end_key = ss.releaseStr();
+    }
+    return RowKeyRange(RowKeyValue(true, std::make_shared<String>(start_key), 0), RowKeyValue(true, std::make_shared<String>(end_key), 0), true, 1);
+}
+
+SegmentPtr SegmentTestBasic::reload(bool is_common_handle, const ColumnDefinesPtr & pre_define_columns, DB::Settings && db_settings)
+{
+    TiFlashStorageTestBasic::reload(std::move(db_settings));
+    storage_path_pool = std::make_unique<StoragePathPool>(db_context->getPathPool().withTable("test", "t1", false));
+    storage_pool = std::make_unique<StoragePool>(*db_context, /*ns_id*/ 100, *storage_path_pool, "test.t1");
+    storage_pool->restore();
+    ColumnDefinesPtr cols = (!pre_define_columns) ? DMTestEnv::getDefaultColumns(is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID) : pre_define_columns;
+    setColumns(cols);
+
+    return Segment::newSegment(*dm_context, table_columns, is_common_handle ? commanHandleKeyRange() : RowKeyRange::newAll(is_common_handle, 1), storage_pool->newMetaPageId(), 0);
+}
+
+void SegmentTestBasic::setColumns(const ColumnDefinesPtr & columns)
+{
+    *table_columns = *columns;
+
+    dm_context = std::make_unique<DMContext>(*db_context,
+                                             *storage_path_pool,
+                                             *storage_pool,
+                                             0,
+                                             /*min_version_*/ 0,
+                                             settings.not_compress_columns,
+                                             options.is_common_handle,
+                                             1,
+                                             db_context->getSettingsRef());
+}
+} // namespace tests
+} // namespace DM
+} // namespace DB
diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h
new file mode 100644
index 00000000000..ab0c7d6d0be
--- /dev/null
+++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h
@@ -0,0 +1,123 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace DB
+{
+namespace DM
+{
+namespace tests
+{
+class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic
+{
+public:
+    struct SegmentTestOptions
+    {
+        bool is_common_handle = false;
+    };
+
+public:
+    void reloadWithOptions(SegmentTestOptions config);
+
+    std::optional<PageId> splitSegment(PageId segment_id);
+    void mergeSegment(PageId left_segment_id, PageId right_segment_id);
+    void mergeSegmentDelta(PageId segment_id);
+    void flushSegmentCache(PageId segment_id);
+    void writeSegment(PageId segment_id, UInt64 write_rows = 100);
+    void writeSegmentWithDeletedPack(PageId segment_id);
+    void deleteRangeSegment(PageId segment_id);
+
+    void writeRandomSegment();
+    void writeRandomSegmentWithDeletedPack();
+    void deleteRangeRandomSegment();
+    void splitRandomSegment();
+    void mergeRandomSegment();
+    void mergeDeltaRandomSegment();
+    void flushCacheRandomSegment();
+
+    void randomSegmentTest(size_t operator_count);
+
+    PageId createNewSegmentWithSomeData();
+    size_t getSegmentRowNumWithoutMVCC(PageId segment_id);
+    size_t getSegmentRowNum(PageId segment_id);
+    void checkSegmentRow(PageId segment_id, size_t expected_row_num);
+    std::pair<Int64, Int64> getSegmentKeyRange(SegmentPtr segment);
+
+protected:
+    // segment_id -> segment
+    std::map<PageId, SegmentPtr> segments;
+
+    enum SegmentOperaterType
+    {
+        Write = 0,
+        DeleteRange,
+        Split,
+        Merge,
+        MergeDelta,
+        FlushCache,
+        WriteDeletedPack,
+        SegmentOperaterMax
+    };
+
+    const std::vector<std::function<void()>> segment_operator_entries = {
+        [this] { writeRandomSegment(); },
+        [this] { deleteRangeRandomSegment(); },
+        [this] { splitRandomSegment(); },
+        [this] { mergeRandomSegment(); },
+        [this] { mergeDeltaRandomSegment(); },
+        [this] { flushCacheRandomSegment(); },
+        [this] { writeRandomSegmentWithDeletedPack(); }};
+
+    PageId getRandomSegmentId();
+
+    std::pair<PageId, PageId> getRandomMergeablePair();
+
+    RowKeyRange commanHandleKeyRange();
+
+    SegmentPtr reload(bool is_common_handle, const ColumnDefinesPtr & pre_define_columns = {}, DB::Settings && db_settings = DB::Settings());
+
+    // setColumns should update dm_context at the same time
+    void setColumns(const ColumnDefinesPtr & columns);
+
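+    // Accessors shared by the helpers above. Note that dm_context is rebuilt
+    // whenever setColumns() installs a new set of column defines.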
+    const ColumnDefinesPtr & tableColumns() const { return table_columns; }
+
+    DMContext & dmContext() { return *dm_context; }
+
+protected:
+    /// All these vars live as references in dm_context.
+    std::unique_ptr<StoragePathPool> storage_path_pool;
+    std::unique_ptr<StoragePool> storage_pool;
+    /// dm_context
+    std::unique_ptr<DMContext> dm_context;
+    ColumnDefinesPtr table_columns;
+    DM::DeltaMergeStore::Settings settings;
+
+    SegmentPtr root_segment;
+    UInt64 version = 0;
+    SegmentTestOptions options;
+};
+} // namespace tests
+} // namespace DM
+} // namespace DB
\ No newline at end of file
diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/Main.cpp b/dbms/src/Storages/DeltaMerge/tools/workload/Main.cpp
deleted file mode 100644
index 092c8a89a42..00000000000
--- a/dbms/src/Storages/DeltaMerge/tools/workload/Main.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2022 PingCAP, Ltd.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include
-
-using namespace DB::DM::tests;
-
-int main(int argc, char ** argv)
-{
-    return DTWorkload::mainEntry(argc, argv);
-}
diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/CMakeLists.txt b/dbms/src/Storages/DeltaMerge/workload/CMakeLists.txt
similarity index 86%
rename from dbms/src/Storages/DeltaMerge/tools/workload/CMakeLists.txt
rename to dbms/src/Storages/DeltaMerge/workload/CMakeLists.txt
index 7227f1cf563..7a83cbec57c 100644
--- a/dbms/src/Storages/DeltaMerge/tools/workload/CMakeLists.txt
+++ b/dbms/src/Storages/DeltaMerge/workload/CMakeLists.txt
@@ -18,6 +18,3 @@ set(dt-workload-src MainEntry.cpp DTWorkload.cpp KeyGenerator.cpp TableGenerator
 add_library(dt-workload-lib ${dt-workload-src})
 target_link_libraries(dt-workload-lib dbms clickhouse_functions clickhouse-server-lib)
-
-add_executable(dt-workload Main.cpp ${dt-workload-src})
-target_link_libraries(dt-workload dbms gtest clickhouse_functions clickhouse-server-lib)
diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/DTWorkload.cpp b/dbms/src/Storages/DeltaMerge/workload/DTWorkload.cpp
similarity index 94%
rename from dbms/src/Storages/DeltaMerge/tools/workload/DTWorkload.cpp
rename to dbms/src/Storages/DeltaMerge/workload/DTWorkload.cpp
index a6113f91d91..a53a1b9ebbd 100644
--- a/dbms/src/Storages/DeltaMerge/tools/workload/DTWorkload.cpp
+++ b/dbms/src/Storages/DeltaMerge/workload/DTWorkload.cpp
@@ -19,16 +19,16 @@
 #include
 #include
 #include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/DTWorkload.h b/dbms/src/Storages/DeltaMerge/workload/DTWorkload.h
similarity index 97%
rename from dbms/src/Storages/DeltaMerge/tools/workload/DTWorkload.h
rename to dbms/src/Storages/DeltaMerge/workload/DTWorkload.h
index 26cc5b6e07c..1ee5ba6b871 100644
--- a/dbms/src/Storages/DeltaMerge/tools/workload/DTWorkload.h
+++ b/dbms/src/Storages/DeltaMerge/workload/DTWorkload.h
@@ -73,7 +73,7 @@ class ThreadStat
class Statistics { public: - Statistics(int write_thread_count = 0, int read_thread_count = 0) + explicit Statistics(int write_thread_count = 0, int read_thread_count = 0) : init_ms(0) , write_stats(write_thread_count) , read_stats(read_thread_count) diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/DataGenerator.cpp b/dbms/src/Storages/DeltaMerge/workload/DataGenerator.cpp similarity index 95% rename from dbms/src/Storages/DeltaMerge/tools/workload/DataGenerator.cpp rename to dbms/src/Storages/DeltaMerge/workload/DataGenerator.cpp index be6ff1dcbbe..479977d46d1 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/DataGenerator.cpp +++ b/dbms/src/Storages/DeltaMerge/workload/DataGenerator.cpp @@ -13,11 +13,11 @@ // limitations under the License. #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include @@ -33,7 +33,7 @@ class RandomDataGenerator : public DataGenerator , rand_gen(std::random_device()()) {} - virtual std::tuple get(uint64_t key) override + std::tuple get(uint64_t key) override { Block block; // Generate 'rowkeys'. @@ -227,7 +227,9 @@ class RandomDataGenerator : public DataGenerator struct tm randomLocalTime() { time_t t = randomUTCTimestamp(); - struct tm res; + struct tm res + { + }; if (localtime_r(&t, &res) == nullptr) { throw std::invalid_argument(fmt::format("localtime_r({}) ret {}", t, strerror(errno))); diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/DataGenerator.h b/dbms/src/Storages/DeltaMerge/workload/DataGenerator.h similarity index 96% rename from dbms/src/Storages/DeltaMerge/tools/workload/DataGenerator.h rename to dbms/src/Storages/DeltaMerge/workload/DataGenerator.h index e32de4591e6..cd29f1a3a80 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/DataGenerator.h +++ b/dbms/src/Storages/DeltaMerge/workload/DataGenerator.h @@ -27,7 +27,7 @@ class DataGenerator public: static std::unique_ptr create(const WorkloadOptions & opts, const TableInfo & table_info, TimestampGenerator & ts_gen); virtual std::tuple get(uint64_t key) = 0; - virtual ~DataGenerator() {} + virtual ~DataGenerator() = default; }; std::string blockToString(const Block & block); diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/Handle.h b/dbms/src/Storages/DeltaMerge/workload/Handle.h similarity index 90% rename from dbms/src/Storages/DeltaMerge/tools/workload/Handle.h rename to dbms/src/Storages/DeltaMerge/workload/Handle.h index eb117a4fddd..2bbd1bd409d 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/Handle.h +++ b/dbms/src/Storages/DeltaMerge/workload/Handle.h @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include @@ -40,7 +40,7 @@ class HandleLock static constexpr uint64_t default_lock_count = 4096; static std::unique_ptr create(const TableInfo & table_info); - HandleLock(uint64_t lock_count = default_lock_count) + explicit HandleLock(uint64_t lock_count = default_lock_count) : rmtxs(lock_count) {} @@ -52,6 +52,7 @@ class HandleLock std::vector> getLocks(const std::vector & handles) { std::vector indexes; + indexes.reserve(handles.size()); for (const auto & h : handles) { indexes.push_back(index(h)); @@ -59,6 +60,7 @@ class HandleLock // Sort mutex indexes to avoid dead lock. 
 sort(indexes.begin(), indexes.end());
         std::vector<std::unique_lock<std::recursive_mutex>> locks;
+        locks.reserve(indexes.size());
         for (auto i : indexes)
         {
             locks.push_back(getLockByIndex(i));
@@ -105,7 +107,7 @@ class HandleTable
         std::lock_guard lock(mtx);
         handle_to_ts[handle] = ts;
         Record r{handle, ts};
-        if (wal != nullptr && wal->write((char *)&r, sizeof(r)) != sizeof(r))
+        if (wal != nullptr && wal->write(reinterpret_cast<char *>(&r), sizeof(r)) != sizeof(r))
         {
             throw std::runtime_error(fmt::format("write ret {}", strerror(errno)));
         }
@@ -134,8 +136,8 @@ class HandleTable
         try
         {
             PosixRandomAccessFile f(fname, -1);
-            Record r;
-            while (f.read((char *)&r, sizeof(r)) == sizeof(r))
+            Record r{};
+            while (f.read(reinterpret_cast<char *>(&r), sizeof(r)) == sizeof(r))
             {
                 handle_to_ts[r.handle] = r.ts;
             }
@@ -156,7 +158,7 @@ class HandleTable
         for (const auto & pa : handle_to_ts)
         {
             Record r{pa.first, pa.second};
-            if (f.write((char *)&r, sizeof(r)) != sizeof(r))
+            if (f.write(reinterpret_cast<char *>(&r), sizeof(r)) != sizeof(r))
             {
                 throw std::runtime_error(fmt::format("write ret {}", strerror(errno)));
             }
@@ -191,7 +193,7 @@ class SharedHandleTable
 public:
     static constexpr uint64_t default_shared_count = 4096;
 
-    SharedHandleTable(uint64_t max_key_count, const std::string & waldir = "", uint64_t shared_cnt = default_shared_count)
+    explicit SharedHandleTable(uint64_t max_key_count, const std::string & waldir = "", uint64_t shared_cnt = default_shared_count)
         : tables(shared_cnt)
     {
         uint64_t max_key_count_per_shared = max_key_count / default_shared_count + 1;
diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/KeyGenerator.cpp b/dbms/src/Storages/DeltaMerge/workload/KeyGenerator.cpp
similarity index 92%
rename from dbms/src/Storages/DeltaMerge/tools/workload/KeyGenerator.cpp
rename to dbms/src/Storages/DeltaMerge/workload/KeyGenerator.cpp
index bb2f2253279..f899ec71b4b 100644
--- a/dbms/src/Storages/DeltaMerge/tools/workload/KeyGenerator.cpp
+++ b/dbms/src/Storages/DeltaMerge/workload/KeyGenerator.cpp
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include -#include +#include +#include #include #include @@ -31,7 +31,7 @@ class IncrementalKeyGenerator : public KeyGenerator , key(0) {} - virtual uint64_t get64() override + uint64_t get64() override { return key.fetch_add(1, std::memory_order_relaxed) % key_count + start_key; } @@ -54,7 +54,7 @@ class UniformDistributionKeyGenerator : public KeyGenerator , uniform_dist(0, key_count) {} - virtual uint64_t get64() override + uint64_t get64() override { std::lock_guard lock(mtx); return uniform_dist(rand_gen); @@ -78,7 +78,7 @@ class NormalDistributionKeyGenerator : public KeyGenerator , normal_dist(key_count / 2.0, key_count / 20.0) {} - virtual uint64_t get64() override + uint64_t get64() override { std::lock_guard lock(mtx); return normal_dist(rand_gen); diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/KeyGenerator.h b/dbms/src/Storages/DeltaMerge/workload/KeyGenerator.h similarity index 92% rename from dbms/src/Storages/DeltaMerge/tools/workload/KeyGenerator.h rename to dbms/src/Storages/DeltaMerge/workload/KeyGenerator.h index 447f3ffc27a..7c8b8fd0080 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/KeyGenerator.h +++ b/dbms/src/Storages/DeltaMerge/workload/KeyGenerator.h @@ -23,8 +23,8 @@ class KeyGenerator public: static std::unique_ptr create(const WorkloadOptions & opts); - KeyGenerator() {} - virtual ~KeyGenerator() {} + KeyGenerator() = default; + virtual ~KeyGenerator() = default; virtual uint64_t get64() = 0; }; diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/Limiter.cpp b/dbms/src/Storages/DeltaMerge/workload/Limiter.cpp similarity index 77% rename from dbms/src/Storages/DeltaMerge/tools/workload/Limiter.cpp rename to dbms/src/Storages/DeltaMerge/workload/Limiter.cpp index 73764d27bc5..65f9e3ce72c 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/Limiter.cpp +++ b/dbms/src/Storages/DeltaMerge/workload/Limiter.cpp @@ -13,8 +13,8 @@ // limitations under the License. 
#include -#include -#include +#include +#include #include #include @@ -24,10 +24,10 @@ namespace DB::DM::tests class ConstantLimiter : public Limiter { public: - ConstantLimiter(uint64_t rate_per_sec) + explicit ConstantLimiter(uint64_t rate_per_sec) : limiter(rate_per_sec, LimiterType::UNKNOW) {} - virtual void request() override + void request() override { limiter.request(1); } @@ -38,7 +38,7 @@ class ConstantLimiter : public Limiter std::unique_ptr Limiter::create(const WorkloadOptions & opts) { - uint64_t per_sec = std::ceil(static_cast(opts.max_write_per_sec / opts.write_thread_count)); + uint64_t per_sec = std::ceil(opts.max_write_per_sec * 1.0 / opts.write_thread_count); return std::make_unique(per_sec); } diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/Limiter.h b/dbms/src/Storages/DeltaMerge/workload/Limiter.h similarity index 96% rename from dbms/src/Storages/DeltaMerge/tools/workload/Limiter.h rename to dbms/src/Storages/DeltaMerge/workload/Limiter.h index e2892b178a2..da2d31c7915 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/Limiter.h +++ b/dbms/src/Storages/DeltaMerge/workload/Limiter.h @@ -23,6 +23,6 @@ class Limiter public: static std::unique_ptr create(const WorkloadOptions & opts); virtual void request() = 0; - virtual ~Limiter() {} + virtual ~Limiter() = default; }; } // namespace DB::DM::tests \ No newline at end of file diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/MainEntry.cpp b/dbms/src/Storages/DeltaMerge/workload/MainEntry.cpp similarity index 97% rename from dbms/src/Storages/DeltaMerge/tools/workload/MainEntry.cpp rename to dbms/src/Storages/DeltaMerge/workload/MainEntry.cpp index f79d414f20b..88cf0b6322f 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/MainEntry.cpp +++ b/dbms/src/Storages/DeltaMerge/workload/MainEntry.cpp @@ -14,10 +14,10 @@ #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include #include #include diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/Options.cpp b/dbms/src/Storages/DeltaMerge/workload/Options.cpp similarity index 98% rename from dbms/src/Storages/DeltaMerge/tools/workload/Options.cpp rename to dbms/src/Storages/DeltaMerge/workload/Options.cpp index 1c6409f3c53..8545d22ca8d 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/Options.cpp +++ b/dbms/src/Storages/DeltaMerge/workload/Options.cpp @@ -13,8 +13,8 @@ // limitations under the License. 
#include -#include -#include +#include +#include #include #include diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/Options.h b/dbms/src/Storages/DeltaMerge/workload/Options.h similarity index 100% rename from dbms/src/Storages/DeltaMerge/tools/workload/Options.h rename to dbms/src/Storages/DeltaMerge/workload/Options.h diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/ReadColumnsGenerator.h b/dbms/src/Storages/DeltaMerge/workload/ReadColumnsGenerator.h similarity index 93% rename from dbms/src/Storages/DeltaMerge/tools/workload/ReadColumnsGenerator.h rename to dbms/src/Storages/DeltaMerge/workload/ReadColumnsGenerator.h index 180409f89e1..c881bb148a2 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/ReadColumnsGenerator.h +++ b/dbms/src/Storages/DeltaMerge/workload/ReadColumnsGenerator.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include @@ -28,7 +28,7 @@ class ReadColumnsGenerator return std::make_unique(table_info); } - ReadColumnsGenerator(const TableInfo & table_info_) + explicit ReadColumnsGenerator(const TableInfo & table_info_) : table_info(table_info_) , rand_gen(std::random_device()()) , uniform_dist(0, table_info_.columns->size() - 1) diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/TableGenerator.cpp b/dbms/src/Storages/DeltaMerge/workload/TableGenerator.cpp similarity index 97% rename from dbms/src/Storages/DeltaMerge/tools/workload/TableGenerator.cpp rename to dbms/src/Storages/DeltaMerge/workload/TableGenerator.cpp index cf52e808ab1..ec29a476d6a 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/TableGenerator.cpp +++ b/dbms/src/Storages/DeltaMerge/workload/TableGenerator.cpp @@ -15,8 +15,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -237,7 +237,7 @@ class RandomTableGenerator : public TableGenerator , rand_gen(std::random_device()()) {} - virtual TableInfo get(int64_t table_id, std::string table_name) override + TableInfo get(int64_t table_id, std::string table_name) override { TableInfo table_info; @@ -293,7 +293,7 @@ class RandomTableGenerator : public TableGenerator class ConstantTableGenerator : public TableGenerator { - virtual TableInfo get(int64_t table_id, std::string table_name) override + TableInfo get(int64_t table_id, std::string table_name) override { TableInfo table_info; diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/TableGenerator.h b/dbms/src/Storages/DeltaMerge/workload/TableGenerator.h similarity index 96% rename from dbms/src/Storages/DeltaMerge/tools/workload/TableGenerator.h rename to dbms/src/Storages/DeltaMerge/workload/TableGenerator.h index aba5c1590b7..b88bf2b72e2 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/TableGenerator.h +++ b/dbms/src/Storages/DeltaMerge/workload/TableGenerator.h @@ -38,6 +38,6 @@ class TableGenerator virtual TableInfo get(int64_t table_id, std::string table_name) = 0; - virtual ~TableGenerator() {} + virtual ~TableGenerator() = default; }; } // namespace DB::DM::tests \ No newline at end of file diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/TimestampGenerator.h b/dbms/src/Storages/DeltaMerge/workload/TimestampGenerator.h similarity index 100% rename from dbms/src/Storages/DeltaMerge/tools/workload/TimestampGenerator.h rename to dbms/src/Storages/DeltaMerge/workload/TimestampGenerator.h diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/Utils.cpp b/dbms/src/Storages/DeltaMerge/workload/Utils.cpp similarity index 94% rename from dbms/src/Storages/DeltaMerge/tools/workload/Utils.cpp rename to 
dbms/src/Storages/DeltaMerge/workload/Utils.cpp index 1cefae724c6..80d9f788016 100644 --- a/dbms/src/Storages/DeltaMerge/tools/workload/Utils.cpp +++ b/dbms/src/Storages/DeltaMerge/workload/Utils.cpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include #include #include @@ -83,7 +83,7 @@ std::string fieldToString(const DataTypePtr & data_type, const Field & f) } else if (t == Field::Types::Which::Decimal256) { - auto i = f.get(); + const auto & i = f.get(); auto scale = dynamic_cast(data_type.get())->getScale(); return i.toString(scale); } @@ -105,8 +105,8 @@ std::vector colToVec(const DataTypePtr & data_type, const ColumnPtr std::string blockToString(const Block & block) { std::string s = "id name type values\n"; - auto & cols = block.getColumnsWithTypeAndName(); - for (auto & col : cols) + const auto & cols = block.getColumnsWithTypeAndName(); + for (const auto & col : cols) { s += fmt::format("{} {} {} {}\n", col.column_id, col.name, col.type->getFamilyName(), colToVec(col.type, col.column)); } diff --git a/dbms/src/Storages/DeltaMerge/tools/workload/Utils.h b/dbms/src/Storages/DeltaMerge/workload/Utils.h similarity index 100% rename from dbms/src/Storages/DeltaMerge/tools/workload/Utils.h rename to dbms/src/Storages/DeltaMerge/workload/Utils.h diff --git a/dbms/src/Storages/IManageableStorage.h b/dbms/src/Storages/IManageableStorage.h index ebf84c592e4..2ff766a9c6d 100644 --- a/dbms/src/Storages/IManageableStorage.h +++ b/dbms/src/Storages/IManageableStorage.h @@ -68,7 +68,7 @@ class IManageableStorage : public IStorage virtual void flushCache(const Context & /*context*/) {} - virtual void flushCache(const Context & /*context*/, const DM::RowKeyRange & /*range_to_flush*/) {} + virtual bool flushCache(const Context & /*context*/, const DM::RowKeyRange & /*range_to_flush*/, [[maybe_unused]] bool try_until_succeed = true) { return true; } virtual BlockInputStreamPtr status() { return {}; } diff --git a/dbms/src/Storages/MarkCache.h b/dbms/src/Storages/MarkCache.h index 5816b0c1bba..728f830e0d0 100644 --- a/dbms/src/Storages/MarkCache.h +++ b/dbms/src/Storages/MarkCache.h @@ -14,24 +14,23 @@ #pragma once -#include - #include #include #include -#include #include +#include + +#include namespace ProfileEvents { - extern const Event MarkCacheHits; - extern const Event MarkCacheMisses; -} +extern const Event MarkCacheHits; +extern const Event MarkCacheMisses; +} // namespace ProfileEvents namespace DB { - /// Estimate of number of bytes in cache for marks. 
struct MarksWeightFunction { @@ -53,7 +52,8 @@ class MarkCache : public LRUCache MappedPtr getOrSet(const Key & key, LoadFunc && load) @@ -70,4 +70,4 @@ class MarkCache : public LRUCache; -} +} // namespace DB diff --git a/dbms/src/Storages/Page/CMakeLists.txt b/dbms/src/Storages/Page/CMakeLists.txt index cead83fa126..f208dc84be2 100644 --- a/dbms/src/Storages/Page/CMakeLists.txt +++ b/dbms/src/Storages/Page/CMakeLists.txt @@ -14,13 +14,3 @@ add_subdirectory(V2) add_subdirectory(tools) - -# PageStorage Stress test -if (ENABLE_V3_PAGESTORAGE) - add_headers_and_sources(page_stress_testing stress) - add_headers_and_sources(page_stress_testing stress/workload) - add_executable(page_stress_testing EXCLUDE_FROM_ALL ${page_stress_testing_sources}) - target_link_libraries(page_stress_testing dbms page_storage_v3) - target_include_directories(page_stress_testing PRIVATE stress) - target_compile_options(page_stress_testing PRIVATE -Wno-format -lc++) # turn off printf format check -endif() \ No newline at end of file diff --git a/dbms/src/Storages/Page/PageUtil.h b/dbms/src/Storages/Page/PageUtil.h index cebcbdb27f2..b0d8f0f88c8 100644 --- a/dbms/src/Storages/Page/PageUtil.h +++ b/dbms/src/Storages/Page/PageUtil.h @@ -281,7 +281,7 @@ void readFile(T & file, } if (unlikely(bytes_read != expected_bytes)) - throw DB::TiFlashException(fmt::format("No enough data in file {}, read bytes: {} , expected bytes: {}", file->getFileName(), bytes_read, expected_bytes), + throw DB::TiFlashException(fmt::format("No enough data in file {}, read bytes: {}, expected bytes: {}, offset: {}", file->getFileName(), bytes_read, expected_bytes, offset), Errors::PageStorage::FileSizeNotMatch); } diff --git a/dbms/src/Storages/Page/V2/tests/gtest_page_util.cpp b/dbms/src/Storages/Page/V2/tests/gtest_page_util.cpp index e72c7a87541..c4dd2178eb9 100644 --- a/dbms/src/Storages/Page/V2/tests/gtest_page_util.cpp +++ b/dbms/src/Storages/Page/V2/tests/gtest_page_util.cpp @@ -17,6 +17,7 @@ #include #include #include +#include namespace DB { @@ -30,6 +31,7 @@ namespace tests static const std::string FileName = "page_util_test"; TEST(PageUtilsTest, ReadWriteFile) +try { ::remove(FileName.c_str()); @@ -52,6 +54,7 @@ TEST(PageUtilsTest, ReadWriteFile) ::remove(FileName.c_str()); } +CATCH TEST(PageUtilsTest, FileNotExists) { diff --git a/dbms/src/Storages/Page/V3/BlobStore.cpp b/dbms/src/Storages/Page/V3/BlobStore.cpp index d5f71841b91..3bd0bd9c4fa 100644 --- a/dbms/src/Storages/Page/V3/BlobStore.cpp +++ b/dbms/src/Storages/Page/V3/BlobStore.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -555,7 +556,7 @@ void BlobStore::read(PageIDAndEntriesV3 & entries, const PageHandler & handler, for (const auto & [page_id_v3, entry] : entries) { - auto blob_file = read(entry.file_id, entry.offset, data_buf, entry.size, read_limiter); + auto blob_file = read(page_id_v3, entry.file_id, entry.offset, data_buf, entry.size, read_limiter); if constexpr (BLOBSTORE_CHECKSUM_ON_READ) { @@ -635,7 +636,7 @@ PageMap BlobStore::read(FieldReadInfos & to_read, const ReadLimiterPtr & read_li // TODO: Continuously fields can read by one system call. 
const auto [beg_offset, end_offset] = entry.getFieldOffsets(field_index); const auto size_to_read = end_offset - beg_offset; - auto blob_file = read(entry.file_id, entry.offset + beg_offset, write_offset, size_to_read, read_limiter); + auto blob_file = read(page_id_v3, entry.file_id, entry.offset + beg_offset, write_offset, size_to_read, read_limiter); fields_offset_in_page.emplace(field_index, read_size_this_entry); if constexpr (BLOBSTORE_CHECKSUM_ON_READ) @@ -732,7 +733,7 @@ PageMap BlobStore::read(PageIDAndEntriesV3 & entries, const ReadLimiterPtr & rea PageMap page_map; for (const auto & [page_id_v3, entry] : entries) { - auto blob_file = read(entry.file_id, entry.offset, pos, entry.size, read_limiter); + auto blob_file = read(page_id_v3, entry.file_id, entry.offset, pos, entry.size, read_limiter); if constexpr (BLOBSTORE_CHECKSUM_ON_READ) { @@ -797,7 +798,7 @@ Page BlobStore::read(const PageIDAndEntryV3 & id_entry, const ReadLimiterPtr & r free(p, buf_size); }); - auto blob_file = read(entry.file_id, entry.offset, data_buf, buf_size, read_limiter); + auto blob_file = read(page_id_v3, entry.file_id, entry.offset, data_buf, buf_size, read_limiter); if constexpr (BLOBSTORE_CHECKSUM_ON_READ) { ChecksumClass digest; @@ -824,11 +825,20 @@ Page BlobStore::read(const PageIDAndEntryV3 & id_entry, const ReadLimiterPtr & r return page; } -BlobFilePtr BlobStore::read(BlobFileId blob_id, BlobFileOffset offset, char * buffers, size_t size, const ReadLimiterPtr & read_limiter, bool background) +BlobFilePtr BlobStore::read(const PageIdV3Internal & page_id_v3, BlobFileId blob_id, BlobFileOffset offset, char * buffers, size_t size, const ReadLimiterPtr & read_limiter, bool background) { assert(buffers != nullptr); - auto blob_file = getBlobFile(blob_id); - blob_file->read(buffers, offset, size, read_limiter, background); + BlobFilePtr blob_file = getBlobFile(blob_id); + try + { + blob_file->read(buffers, offset, size, read_limiter, background); + } + catch (DB::Exception & e) + { + // add debug message + e.addMessage(fmt::format("(error while reading page data [page_id={}] [blob_id={}] [offset={}] [size={}] [background={}])", page_id_v3, blob_id, offset, size, background)); + e.rethrow(); + } return blob_file; } @@ -841,8 +851,8 @@ struct BlobStoreGCInfo toTypeString("Read-Only Blob", 0), toTypeString("No GC Blob", 1), toTypeString("Full GC Blob", 2), - toTypeString("Truncated Blob", 3), - toTypeString("Big Blob", 4)); + toTypeString("Big Blob", 3), + toTypeTruncateString("Truncated Blob")); } void appendToReadOnlyBlob(const BlobFileId blob_id, double valid_rate) @@ -860,23 +870,24 @@ struct BlobStoreGCInfo blob_gc_info[2].emplace_back(std::make_pair(blob_id, valid_rate)); } - void appendToTruncatedBlob(const BlobFileId blob_id, double valid_rate) + void appendToBigBlob(const BlobFileId blob_id, double valid_rate) { blob_gc_info[3].emplace_back(std::make_pair(blob_id, valid_rate)); } - void appendToBigBlob(const BlobFileId blob_id, double valid_rate) + void appendToTruncatedBlob(const BlobFileId blob_id, UInt64 origin_size, UInt64 truncated_size, double valid_rate) { - blob_gc_info[4].emplace_back(std::make_pair(blob_id, valid_rate)); + blob_gc_truncate_info.emplace_back(std::make_tuple(blob_id, origin_size, truncated_size, valid_rate)); } private: // 1. read only blob // 2. no need gc blob // 3. full gc blob - // 4. need truncate blob - // 5. big blob - std::vector> blob_gc_info[5]; + // 4. 
big blob
+    std::vector<std::pair<BlobFileId, double>> blob_gc_info[4];
+
+    std::vector<std::tuple<BlobFileId, UInt64, UInt64, double>> blob_gc_truncate_info;
 
     String toTypeString(const std::string_view prefix, const size_t index) const
     {
@@ -901,6 +912,32 @@ struct BlobStoreGCInfo
 
         return fmt_buf.toString();
     }
+
+    String toTypeTruncateString(const std::string_view prefix) const
+    {
+        FmtBuffer fmt_buf;
+        if (blob_gc_truncate_info.empty())
+        {
+            fmt_buf.fmtAppend("{}: [null]", prefix);
+        }
+        else
+        {
+            fmt_buf.fmtAppend("{}: [", prefix);
+            fmt_buf.joinStr(
+                blob_gc_truncate_info.begin(),
+                blob_gc_truncate_info.end(),
+                [](const auto arg, FmtBuffer & fb) {
+                    fb.fmtAppend("{} origin: {} truncate: {} rate: {:.2f}", //
+                                 std::get<0>(arg), // blob id
+                                 std::get<1>(arg), // origin size
+                                 std::get<2>(arg), // truncated size
+                                 std::get<3>(arg)); // valid rate
+                },
+                ", ");
+            fmt_buf.append("]");
+        }
+        return fmt_buf.toString();
+    }
 };
 
 std::vector<BlobFileId> BlobStore::getGCStats()
@@ -943,7 +980,7 @@ std::vector<BlobFileId> BlobStore::getGCStats()
         }
 
         auto lock = stat->lock();
-        auto right_margin = stat->smap->getRightMargin();
+        auto right_margin = stat->smap->getUsedBoundary();
 
         // Avoid divide by zero
         if (right_margin == 0)
@@ -956,14 +993,13 @@ std::vector<BlobFileId> BlobStore::getGCStats()
                 stat->sm_valid_rate));
         }
 
-        LOG_FMT_TRACE(log, "Current blob is empty [blob_id={}, total size(all invalid)={}] [valid_rate={}].", stat->id, stat->sm_total_size, stat->sm_valid_rate);
-
         // If current blob empty, the size of in disk blob may not empty
         // So we need truncate current blob, and let it be reused.
         auto blobfile = getBlobFile(stat->id);
-        LOG_FMT_TRACE(log, "Truncate empty blob file [blob_id={}] to 0.", stat->id);
+        LOG_FMT_INFO(log, "Current blob file is empty, truncated to zero [blob_id={}] [total_size={}] [valid_rate={}]", stat->id, stat->sm_total_size, stat->sm_valid_rate);
         blobfile->truncate(right_margin);
-        blobstore_gc_info.appendToTruncatedBlob(stat->id, stat->sm_valid_rate);
+        blobstore_gc_info.appendToTruncatedBlob(stat->id, stat->sm_total_size, right_margin, stat->sm_valid_rate);
+        stat->sm_total_size = right_margin;
         continue;
     }
@@ -1004,9 +1040,10 @@ std::vector<BlobFileId> BlobStore::getGCStats()
             auto blobfile = getBlobFile(stat->id);
             LOG_FMT_TRACE(log, "Truncate blob file [blob_id={}] [origin size={}] [truncated size={}]", stat->id, stat->sm_total_size, right_margin);
             blobfile->truncate(right_margin);
+            blobstore_gc_info.appendToTruncatedBlob(stat->id, stat->sm_total_size, right_margin, stat->sm_valid_rate);
+            stat->sm_total_size = right_margin;
 
             stat->sm_valid_rate = stat->sm_valid_size * 1.0 / stat->sm_total_size;
-            blobstore_gc_info.appendToTruncatedBlob(stat->id, stat->sm_valid_rate);
         }
     }
 }
@@ -1117,21 +1154,15 @@ PageEntriesEdit BlobStore::gc(std::map<BlobFileId, PageIdAndVersionedEntries> &
             std::tie(blobfile_id, file_offset_beg) = getPosFromStats(next_alloc_size);
         }
 
-        PageEntryV3 new_entry;
-
-        read(file_id, entry.offset, data_pos, entry.size, read_limiter, /*background*/ true);
-
-        // No need do crc again, crc won't be changed.
-        new_entry.checksum = entry.checksum;
-
-        // Need copy the field_offsets
-        new_entry.field_offsets = entry.field_offsets;
-
-        // Entry size won't be changed.
-        new_entry.size = entry.size;
+        // Read the data into the buffer by the old entry
+        read(page_id, file_id, entry.offset, data_pos, entry.size, read_limiter, /*background*/ true);
+        // Most vars of the entry are not changed, but the file id and offset
+        // need to be updated.
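+        // (The checksum, size and field_offsets are copied over unchanged;
+        // padded_size is reset below since the data is rewritten compactly
+        // at the new offset.)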
+ PageEntryV3 new_entry = entry; new_entry.file_id = blobfile_id; new_entry.offset = file_offset_beg + offset_in_data; + new_entry.padded_size = 0; // reset padded size to be zero offset_in_data += new_entry.size; data_pos += new_entry.size; diff --git a/dbms/src/Storages/Page/V3/BlobStore.h b/dbms/src/Storages/Page/V3/BlobStore.h index 24bf4652123..6b139b98557 100644 --- a/dbms/src/Storages/Page/V3/BlobStore.h +++ b/dbms/src/Storages/Page/V3/BlobStore.h @@ -296,7 +296,7 @@ class BlobStore : private Allocator PageEntriesEdit handleLargeWrite(DB::WriteBatch & wb, const WriteLimiterPtr & write_limiter = nullptr); - BlobFilePtr read(BlobFileId blob_id, BlobFileOffset offset, char * buffers, size_t size, const ReadLimiterPtr & read_limiter = nullptr, bool background = false); + BlobFilePtr read(const PageIdV3Internal & page_id_v3, BlobFileId blob_id, BlobFileOffset offset, char * buffers, size_t size, const ReadLimiterPtr & read_limiter = nullptr, bool background = false); /** * Ask BlobStats to get a span from BlobStat. diff --git a/dbms/src/Storages/Page/V3/PageDirectory.cpp b/dbms/src/Storages/Page/V3/PageDirectory.cpp index e9b754854b8..951da42de1c 100644 --- a/dbms/src/Storages/Page/V3/PageDirectory.cpp +++ b/dbms/src/Storages/Page/V3/PageDirectory.cpp @@ -478,7 +478,7 @@ PageSize VersionedPageEntries::getEntriesByBlobIds( bool VersionedPageEntries::cleanOutdatedEntries( UInt64 lowest_seq, std::map> * normal_entries_to_deref, - PageEntriesV3 & entries_removed, + PageEntriesV3 * entries_removed, const PageLock & /*page_lock*/) { if (type == EditRecordType::VAR_EXTERNAL) @@ -541,7 +541,10 @@ bool VersionedPageEntries::cleanOutdatedEntries( { if (iter->second.being_ref_count == 1) { - entries_removed.emplace_back(iter->second.entry); + if (entries_removed) + { + entries_removed->emplace_back(iter->second.entry); + } iter = entries.erase(iter); } // The `being_ref_count` for this version is valid. While for older versions, @@ -551,7 +554,10 @@ bool VersionedPageEntries::cleanOutdatedEntries( else { // else there are newer "entry" in the version list, the outdated entries should be removed - entries_removed.emplace_back(iter->second.entry); + if (entries_removed) + { + entries_removed->emplace_back(iter->second.entry); + } iter = entries.erase(iter); } } @@ -564,7 +570,7 @@ bool VersionedPageEntries::cleanOutdatedEntries( return entries.empty() || (entries.size() == 1 && entries.begin()->second.isDelete()); } -bool VersionedPageEntries::derefAndClean(UInt64 lowest_seq, PageIdV3Internal page_id, const PageVersion & deref_ver, const Int64 deref_count, PageEntriesV3 & entries_removed) +bool VersionedPageEntries::derefAndClean(UInt64 lowest_seq, PageIdV3Internal page_id, const PageVersion & deref_ver, const Int64 deref_count, PageEntriesV3 * entries_removed) { auto page_lock = acquireLock(); if (type == EditRecordType::VAR_EXTERNAL) @@ -1223,7 +1229,7 @@ bool PageDirectory::tryDumpSnapshot(const ReadLimiterPtr & read_limiter, const W // `being_ref_count` by the function `createSnapshot()`. 
 assert(!files_snap.persisted_log_files.empty()); // should not be empty when `needSave` return true
     auto log_num = files_snap.persisted_log_files.rbegin()->log_num;
-    auto identifier = fmt::format("{}_dump_{}", wal->name(), log_num);
+    auto identifier = fmt::format("{}.dump_{}", wal->name(), log_num);
     auto snapshot_reader = wal->createReaderForFiles(identifier, files_snap.persisted_log_files, read_limiter);
     PageDirectoryFactory factory;
     // we just use the `collapsed_dir` to dump edit of the snapshot, should never call functions like `apply` that
@@ -1239,7 +1245,7 @@ bool PageDirectory::tryDumpSnapshot(const ReadLimiterPtr & read_limiter, const W
     return done_any_io;
 }
 
-PageEntriesV3 PageDirectory::gcInMemEntries()
+PageEntriesV3 PageDirectory::gcInMemEntries(bool return_removed_entries)
 {
     UInt64 lowest_seq = sequence.load();
 
@@ -1303,7 +1309,7 @@ PageEntriesV3 PageDirectory::gcInMemEntries()
         const bool all_deleted = iter->second->cleanOutdatedEntries(
             lowest_seq,
             &normal_entries_to_deref,
-            all_del_entries,
+            return_removed_entries ? &all_del_entries : nullptr,
             iter->second->acquireLock());
 
         {
@@ -1342,7 +1348,7 @@ PageEntriesV3 PageDirectory::gcInMemEntries()
             page_id,
             /*deref_ver=*/deref_counter.first,
             /*deref_count=*/deref_counter.second,
-            all_del_entries);
+            return_removed_entries ? &all_del_entries : nullptr);
 
         if (all_deleted)
         {
diff --git a/dbms/src/Storages/Page/V3/PageDirectory.h b/dbms/src/Storages/Page/V3/PageDirectory.h
index bd7c433022f..2f0f09f4e42 100644
--- a/dbms/src/Storages/Page/V3/PageDirectory.h
+++ b/dbms/src/Storages/Page/V3/PageDirectory.h
@@ -223,14 +223,14 @@ class VersionedPageEntries
     bool cleanOutdatedEntries(
         UInt64 lowest_seq,
         std::map<PageIdV3Internal, std::pair<PageVersion, Int64>> * normal_entries_to_deref,
-        PageEntriesV3 & entries_removed,
+        PageEntriesV3 * entries_removed,
         const PageLock & page_lock);
     bool derefAndClean(
         UInt64 lowest_seq,
         PageIdV3Internal page_id,
         const PageVersion & deref_ver,
         Int64 deref_count,
-        PageEntriesV3 & entries_removed);
+        PageEntriesV3 * entries_removed);
 
     void collapseTo(UInt64 seq, PageIdV3Internal page_id, PageEntriesEdit & edit);
 
@@ -360,7 +360,9 @@ class PageDirectory
     bool tryDumpSnapshot(const ReadLimiterPtr & read_limiter = nullptr, const WriteLimiterPtr & write_limiter = nullptr);
 
-    PageEntriesV3 gcInMemEntries();
+    // Perform a GC for in-memory entries and return the removed entries.
+    // If `return_removed_entries` is false, then just return an empty set.
+    PageEntriesV3 gcInMemEntries(bool return_removed_entries = true);
 
     std::set<PageId> getAliveExternalIds(NamespaceId ns_id) const;
 
diff --git a/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp b/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
index 483c5073ab5..968049a3273 100644
--- a/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
+++ b/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
@@ -44,7 +44,8 @@ PageDirectoryPtr PageDirectoryFactory::createFromReader(String storage_name, WAL
     // After restoring from the disk, we need cleanup all invalid entries in memory, or it will
     // try to run GC again on some entries that are already marked as invalid in BlobStore.
-    dir->gcInMemEntries();
+    // There is no need to remove the expired entries in BlobStore here, so skip filling removed_entries to improve performance.
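+    // (With return_removed_entries=false the collected set would be discarded
+    // anyway; see the doc of PageDirectory::gcInMemEntries above.)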
+    dir->gcInMemEntries(/*return_removed_entries=*/false);
 
     LOG_FMT_INFO(DB::Logger::get("PageDirectoryFactory", storage_name), "PageDirectory restored [max_page_id={}] [max_applied_ver={}]", dir->getMaxId(), dir->sequence);
 
     if (blob_stats)
@@ -84,7 +85,8 @@ PageDirectoryPtr PageDirectoryFactory::createFromEdit(String storage_name, FileP
     // After restoring from the disk, we need cleanup all invalid entries in memory, or it will
     // try to run GC again on some entries that are already marked as invalid in BlobStore.
-    dir->gcInMemEntries();
+    // There is no need to remove the expired entries in BlobStore when restoring, so no need to fill removed_entries.
+    dir->gcInMemEntries(/*return_removed_entries=*/false);
 
     if (blob_stats)
     {
diff --git a/dbms/src/Storages/Page/V3/spacemap/SpaceMap.h b/dbms/src/Storages/Page/V3/spacemap/SpaceMap.h
index ae44b608de0..d230b2f3e35 100644
--- a/dbms/src/Storages/Page/V3/spacemap/SpaceMap.h
+++ b/dbms/src/Storages/Page/V3/spacemap/SpaceMap.h
@@ -95,9 +95,11 @@ class SpaceMap
     virtual std::tuple searchInsertOffset(size_t size) = 0;
 
     /**
-     * Get the offset of the last free block. `[margin_offset, +∞)` is not used at all.
+     * Get the used boundary of this SpaceMap.
+     * The return value (`used_boundary`) means that `[used_boundary + 1, +∞)` is safe to be truncated.
+     * If the `used_boundary` is equal to the `end` of this SpaceMap, it means that there is no space to be truncated.
      */
-    virtual UInt64 getRightMargin() = 0;
+    virtual UInt64 getUsedBoundary() = 0;
 
     /**
      * Get the accurate max capacity of the space map.
diff --git a/dbms/src/Storages/Page/V3/spacemap/SpaceMapBig.h b/dbms/src/Storages/Page/V3/spacemap/SpaceMapBig.h
index 22128a09f30..81c2a5cb786 100644
--- a/dbms/src/Storages/Page/V3/spacemap/SpaceMapBig.h
+++ b/dbms/src/Storages/Page/V3/spacemap/SpaceMapBig.h
@@ -74,7 +74,7 @@ class BigSpaceMap
         return std::make_pair(size_in_used, size_in_used);
     }
 
-    UInt64 getRightMargin() override
+    UInt64 getUsedBoundary() override
     {
         return end;
     }
diff --git a/dbms/src/Storages/Page/V3/spacemap/SpaceMapRBTree.cpp b/dbms/src/Storages/Page/V3/spacemap/SpaceMapRBTree.cpp
index 54275574060..4bd53b93e07 100644
--- a/dbms/src/Storages/Page/V3/spacemap/SpaceMapRBTree.cpp
+++ b/dbms/src/Storages/Page/V3/spacemap/SpaceMapRBTree.cpp
@@ -84,7 +84,7 @@ static void rb_get_new_entry(struct SmapRbEntry ** entry, UInt64 start, UInt64 c
 {
     struct SmapRbEntry * new_entry;
 
-    new_entry = static_cast<struct SmapRbEntry *>(calloc(1, sizeof(struct SmapRbEntry)));
+    new_entry = static_cast<struct SmapRbEntry *>(calloc(1, sizeof(struct SmapRbEntry))); // NOLINT
     if (new_entry == nullptr)
     {
         return;
@@ -115,7 +115,7 @@ inline static void rb_free_entry(struct RbPrivate * private_data, struct SmapRbE
         private_data->read_index_next = nullptr;
     }
 
-    free(entry);
+    free(entry); // NOLINT
 }
@@ -419,7 +419,7 @@ std::shared_ptr<RBTreeSpaceMap> RBTreeSpaceMap::create(UInt64 start, UInt64 end)
 {
     auto ptr = std::shared_ptr<RBTreeSpaceMap>(new RBTreeSpaceMap(start, end));
 
-    ptr->rb_tree = static_cast<struct RbPrivate *>(calloc(1, sizeof(struct RbPrivate)));
+    ptr->rb_tree = static_cast<struct RbPrivate *>(calloc(1, sizeof(struct RbPrivate))); // NOLINT
     if (ptr->rb_tree == nullptr)
     {
         return nullptr;
@@ -435,7 +435,7 @@ std::shared_ptr<RBTreeSpaceMap> RBTreeSpaceMap::create(UInt64 start, UInt64 end)
     if (!rb_insert_entry(start, end, ptr->rb_tree, ptr->log))
     {
         LOG_FMT_ERROR(ptr->log, "Erorr happend, when mark all space free.
[start={}] , [end={}]", start, end); - free(ptr->rb_tree); + free(ptr->rb_tree); // NOLINT return nullptr; } return ptr; @@ -451,7 +451,7 @@ static void rb_free_tree(struct rb_root * root) next = rb_tree_next(node); entry = node_to_entry(node); rb_node_remove(node, root); - free(entry); + free(entry); // NOLINT } } @@ -460,7 +460,7 @@ void RBTreeSpaceMap::freeSmap() if (rb_tree) { rb_free_tree(&rb_tree->root); - free(rb_tree); + free(rb_tree); // NOLINT } } @@ -734,7 +734,7 @@ std::pair RBTreeSpaceMap::getSizes() const } } -UInt64 RBTreeSpaceMap::getRightMargin() +UInt64 RBTreeSpaceMap::getUsedBoundary() { struct rb_node * node = rb_tree_last(&rb_tree->root); if (node == nullptr) @@ -743,6 +743,20 @@ UInt64 RBTreeSpaceMap::getRightMargin() } auto * entry = node_to_entry(node); + + // If the `offset+size` of the last free node is not equal to `end`, it means the range `[last_node.offset, end)` is marked as used, + // then we should return `end` as the used boundary. + // + // eg. + // 1. The spacemap manage a space of `[0, 100]` + // 2. A span {offset=90, size=10} is marked as used, then the free range in SpaceMap is `[0, 90)` + // 3. The return value should be 100 + if (entry->start + entry->count != end) + { + return end; + } + + // Else we should return the offset of last free node return entry->start; } diff --git a/dbms/src/Storages/Page/V3/spacemap/SpaceMapRBTree.h b/dbms/src/Storages/Page/V3/spacemap/SpaceMapRBTree.h index 0393fda081b..04691007a47 100644 --- a/dbms/src/Storages/Page/V3/spacemap/SpaceMapRBTree.h +++ b/dbms/src/Storages/Page/V3/spacemap/SpaceMapRBTree.h @@ -46,7 +46,7 @@ class RBTreeSpaceMap std::pair getSizes() const override; - UInt64 getRightMargin() override; + UInt64 getUsedBoundary() override; protected: RBTreeSpaceMap(UInt64 start, UInt64 end) diff --git a/dbms/src/Storages/Page/V3/spacemap/SpaceMapSTDMap.h b/dbms/src/Storages/Page/V3/spacemap/SpaceMapSTDMap.h index b6ff8797f0f..41ddd77d03a 100644 --- a/dbms/src/Storages/Page/V3/spacemap/SpaceMapSTDMap.h +++ b/dbms/src/Storages/Page/V3/spacemap/SpaceMapSTDMap.h @@ -111,13 +111,29 @@ class STDMapSpaceMap } } - UInt64 getRightMargin() override + UInt64 getUsedBoundary() override { if (free_map.empty()) { - return end - start; + return end; } - return free_map.rbegin()->first; + + const auto & last_node_it = free_map.rbegin(); + + // If the `offset+size` of the last free node is not equal to `end`, it means the range `[last_node.offset, end)` is marked as used, + // then we should return `end` as the used boundary. + // + // eg. + // 1. The spacemap manage a space of `[0, 100]` + // 2. A span {offset=90, size=10} is marked as used, then the free range in SpaceMap is `[0, 90)` + // 3. 
The return value should be 100 + if (last_node_it->first + last_node_it->second != end) + { + return end; + } + + // Else we should return the offset of last free node + return last_node_it->first; } bool isMarkUnused(UInt64 offset, size_t length) override diff --git a/dbms/src/Storages/Page/V3/tests/gtest_blob_store.cpp b/dbms/src/Storages/Page/V3/tests/gtest_blob_store.cpp index 048140ed04f..f9daacc4cce 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_blob_store.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_blob_store.cpp @@ -82,6 +82,7 @@ try stats.restoreByEntry(PageEntryV3{ .file_id = file_id1, .size = 128, + .padded_size = 0, .tag = 0, .offset = 1024, .checksum = 0x4567, @@ -89,6 +90,7 @@ try stats.restoreByEntry(PageEntryV3{ .file_id = file_id1, .size = 512, + .padded_size = 0, .tag = 0, .offset = 2048, .checksum = 0x4567, @@ -96,6 +98,7 @@ try stats.restoreByEntry(PageEntryV3{ .file_id = file_id2, .size = 512, + .padded_size = 0, .tag = 0, .offset = 2048, .checksum = 0x4567, @@ -303,6 +306,7 @@ try blob_store.blob_stats.restoreByEntry(PageEntryV3{ .file_id = file_id1, .size = 128, + .padded_size = 0, .tag = 0, .offset = 1024, .checksum = 0x4567, @@ -310,6 +314,7 @@ try blob_store.blob_stats.restoreByEntry(PageEntryV3{ .file_id = file_id1, .size = 512, + .padded_size = 0, .tag = 0, .offset = 2048, .checksum = 0x4567, @@ -317,6 +322,7 @@ try blob_store.blob_stats.restoreByEntry(PageEntryV3{ .file_id = file_id2, .size = 512, + .padded_size = 0, .tag = 0, .offset = 2048, .checksum = 0x4567, @@ -399,6 +405,7 @@ try blob_store.blob_stats.restoreByEntry(PageEntryV3{ .file_id = id, .size = 1024, + .padded_size = 0, .tag = 0, .offset = 0, .checksum = 0x4567, @@ -531,7 +538,8 @@ TEST_F(BlobStoreTest, testWriteRead) ASSERT_EQ(record.entry.file_id, 1); // Read directly from the file - blob_store.read(record.entry.file_id, + blob_store.read(buildV3Id(TEST_NAMESPACE_ID, page_id), + record.entry.file_id, record.entry.offset, c_buff_read + index * buff_size, record.entry.size, @@ -631,7 +639,8 @@ TEST_F(BlobStoreTest, testWriteReadWithIOLimiter) { for (const auto & record : edits[i].getRecords()) { - blob_store.read(record.entry.file_id, + blob_store.read(buildV3Id(TEST_NAMESPACE_ID, page_id), + record.entry.file_id, record.entry.offset, c_buff_read + i * buff_size, record.entry.size, @@ -809,7 +818,8 @@ TEST_F(BlobStoreTest, testFeildOffsetWriteRead) ASSERT_EQ(check_field_sizes, offsets); // Read - blob_store.read(record.entry.file_id, + blob_store.read(buildV3Id(TEST_NAMESPACE_ID, page_id), + record.entry.file_id, record.entry.offset, c_buff_read + index * buff_size, record.entry.size, diff --git a/dbms/src/Storages/Page/V3/tests/gtest_free_map.cpp b/dbms/src/Storages/Page/V3/tests/gtest_free_map.cpp index f7120f000b2..faec139920b 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_free_map.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_free_map.cpp @@ -427,6 +427,43 @@ TEST_P(SpaceMapTest, TestGetMaxCap) } } + +TEST_P(SpaceMapTest, TestGetUsedBoundary) +{ + { + auto smap = SpaceMap::createSpaceMap(test_type, 0, 100); + ASSERT_TRUE(smap->markUsed(50, 10)); + ASSERT_EQ(smap->getUsedBoundary(), 60); + ASSERT_TRUE(smap->markUsed(80, 10)); + ASSERT_EQ(smap->getUsedBoundary(), 90); + + ASSERT_TRUE(smap->markUsed(90, 10)); + ASSERT_EQ(smap->getUsedBoundary(), 100); + } + + { + auto smap = SpaceMap::createSpaceMap(test_type, 0, 100); + ASSERT_TRUE(smap->markUsed(90, 10)); + ASSERT_EQ(smap->getUsedBoundary(), 100); + + ASSERT_TRUE(smap->markUsed(20, 10)); + ASSERT_EQ(smap->getUsedBoundary(), 100); + 
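+        // Freeing the tail range [90, 100) leaves [20, 30) as the last used
+        // range, so the used boundary falls back to 30.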
+ ASSERT_TRUE(smap->markFree(90, 10)); + ASSERT_EQ(smap->getUsedBoundary(), 30); + + ASSERT_TRUE(smap->markUsed(90, 10)); + ASSERT_EQ(smap->getUsedBoundary(), 100); + } + + { + auto smap = SpaceMap::createSpaceMap(test_type, 0, 100); + ASSERT_EQ(smap->getUsedBoundary(), 0); + ASSERT_TRUE(smap->markUsed(0, 100)); + ASSERT_EQ(smap->getUsedBoundary(), 100); + } +} + INSTANTIATE_TEST_CASE_P( Type, SpaceMapTest, diff --git a/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp b/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp index 6e2b0efa1ea..6d6ef41630f 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp @@ -75,7 +75,7 @@ try auto snap0 = dir->createSnapshot(); EXPECT_ENTRY_NOT_EXIST(dir, 1, snap0); - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry1); @@ -85,7 +85,7 @@ try auto snap1 = dir->createSnapshot(); EXPECT_ENTRY_EQ(entry1, dir, 1, snap1); - PageEntryV3 entry2{.file_id = 2, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(2, entry2); @@ -102,7 +102,7 @@ try EXPECT_ENTRIES_EQ(expected_entries, dir, ids, snap2); } - PageEntryV3 entry2_v2{.file_id = 2 + 102, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2_v2{.file_id = 2 + 102, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.del(2); @@ -123,7 +123,7 @@ try auto snap0 = dir->createSnapshot(); EXPECT_ENTRY_NOT_EXIST(dir, page_id, snap0); - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(page_id, entry1); @@ -133,7 +133,7 @@ try auto snap1 = dir->createSnapshot(); EXPECT_ENTRY_EQ(entry1, dir, page_id, snap1); - PageEntryV3 entry2{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x1234, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x1234, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(page_id, entry2); @@ -151,7 +151,7 @@ try // Put identical page within one `edit` page_id++; - PageEntryV3 entry3{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x12345, .checksum = 0x4567}; + PageEntryV3 entry3{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x12345, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(page_id, entry1); @@ -172,8 +172,8 @@ CATCH TEST_F(PageDirectoryTest, ApplyPutDelRead) try { - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry2{.file_id = 2, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry1); @@ -185,8 +185,8 @@ try EXPECT_ENTRY_EQ(entry1, dir, 1, snap1); EXPECT_ENTRY_EQ(entry2, dir, 2, snap1); - PageEntryV3 entry3{.file_id = 3, .size = 1024, .tag = 0, 
.offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry4{.file_id = 4, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry3{.file_id = 3, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry4{.file_id = 4, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.del(2); @@ -217,8 +217,8 @@ CATCH TEST_F(PageDirectoryTest, ApplyUpdateOnRefEntries) try { - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry2{.file_id = 2, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry1); @@ -236,14 +236,14 @@ try EXPECT_ENTRY_EQ(entry2, dir, 3, snap1); // Update on ref page is not allowed - PageEntryV3 entry_updated{.file_id = 999, .size = 16, .tag = 0, .offset = 0x123, .checksum = 0x123}; + PageEntryV3 entry_updated{.file_id = 999, .size = 16, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x123}; { PageEntriesEdit edit; edit.put(3, entry_updated); ASSERT_ANY_THROW(dir->apply(std::move(edit))); } - PageEntryV3 entry_updated2{.file_id = 777, .size = 16, .tag = 0, .offset = 0x123, .checksum = 0x123}; + PageEntryV3 entry_updated2{.file_id = 777, .size = 16, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x123}; { PageEntriesEdit edit; edit.put(2, entry_updated2); @@ -255,8 +255,8 @@ CATCH TEST_F(PageDirectoryTest, ApplyDeleteOnRefEntries) try { - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry2{.file_id = 2, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry1); @@ -305,8 +305,8 @@ CATCH TEST_F(PageDirectoryTest, ApplyRefOnRefEntries) try { - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry2{.file_id = 2, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry1); @@ -343,8 +343,8 @@ CATCH TEST_F(PageDirectoryTest, ApplyDuplicatedRefEntries) try { - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry2{.file_id = 2, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry1); @@ -410,8 +410,8 @@ CATCH TEST_F(PageDirectoryTest, ApplyCollapseDuplicatedRefEntries) try { - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry2{.file_id = 2, .size = 1024, 
.tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry1); @@ -447,9 +447,9 @@ CATCH TEST_F(PageDirectoryTest, ApplyRefToNotExistEntry) try { - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry2{.file_id = 2, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry3{.file_id = 3, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry3{.file_id = 3, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry1); @@ -628,12 +628,12 @@ try } CATCH -#define INSERT_BLOBID_ENTRY(BLOBID, VERSION) \ - PageEntryV3 entry_v##VERSION{.file_id = (BLOBID), .size = (VERSION), .tag = 0, .offset = 0x123, .checksum = 0x4567}; \ +#define INSERT_BLOBID_ENTRY(BLOBID, VERSION) \ + PageEntryV3 entry_v##VERSION{.file_id = (BLOBID), .size = (VERSION), .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; \ entries.createNewEntry(PageVersion(VERSION), entry_v##VERSION); #define INSERT_ENTRY(VERSION) INSERT_BLOBID_ENTRY(1, VERSION) -#define INSERT_GC_ENTRY(VERSION, EPOCH) \ - PageEntryV3 entry_gc_v##VERSION##_##EPOCH{.file_id = 2, .size = 100 * (VERSION) + (EPOCH), .tag = 0, .offset = 0x234, .checksum = 0x5678}; \ +#define INSERT_GC_ENTRY(VERSION, EPOCH) \ + PageEntryV3 entry_gc_v##VERSION##_##EPOCH{.file_id = 2, .size = 100 * (VERSION) + (EPOCH), .padded_size = 0, .tag = 0, .offset = 0x234, .checksum = 0x5678}; \ entries.createNewEntry(PageVersion((VERSION), (EPOCH)), entry_gc_v##VERSION##_##EPOCH); class VersionedEntriesTest : public ::testing::Test @@ -644,14 +644,14 @@ class VersionedEntriesTest : public ::testing::Test { DerefCounter deref_counter; PageEntriesV3 removed_entries; - bool all_removed = entries.cleanOutdatedEntries(seq, &deref_counter, removed_entries, entries.acquireLock()); + bool all_removed = entries.cleanOutdatedEntries(seq, &deref_counter, &removed_entries, entries.acquireLock()); return {all_removed, removed_entries, deref_counter}; } std::tuple runDeref(UInt64 seq, PageVersion ver, Int64 decrease_num) { PageEntriesV3 removed_entries; - bool all_removed = entries.derefAndClean(seq, buildV3Id(TEST_NAMESPACE_ID, page_id), ver, decrease_num, removed_entries); + bool all_removed = entries.derefAndClean(seq, buildV3Id(TEST_NAMESPACE_ID, page_id), ver, decrease_num, &removed_entries); return {all_removed, removed_entries}; } @@ -1271,12 +1271,12 @@ class PageDirectoryGCTest : public PageDirectoryTest { }; -#define INSERT_ENTRY_TO(PAGE_ID, VERSION, BLOB_FILE_ID) \ - PageEntryV3 entry_v##VERSION{.file_id = (BLOB_FILE_ID), .size = (VERSION), .tag = 0, .offset = 0x123, .checksum = 0x4567}; \ - { \ - PageEntriesEdit edit; \ - edit.put((PAGE_ID), entry_v##VERSION); \ - dir->apply(std::move(edit)); \ +#define INSERT_ENTRY_TO(PAGE_ID, VERSION, BLOB_FILE_ID) \ + PageEntryV3 entry_v##VERSION{.file_id = (BLOB_FILE_ID), .size = (VERSION), .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; \ + { \ + PageEntriesEdit edit; \ + 
edit.put((PAGE_ID), entry_v##VERSION); \ + dir->apply(std::move(edit)); \ } // Insert an entry into mvcc directory #define INSERT_ENTRY(PAGE_ID, VERSION) INSERT_ENTRY_TO(PAGE_ID, VERSION, 1) @@ -1566,7 +1566,7 @@ try INSERT_ENTRY_ACQ_SNAP(page_id, 5); INSERT_ENTRY(another_page_id, 6); INSERT_ENTRY(another_page_id, 7); - PageEntryV3 entry_v8{.file_id = 1, .size = 8, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_v8{.file_id = 1, .size = 8, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.del(page_id); @@ -1756,7 +1756,7 @@ TEST_F(PageDirectoryGCTest, GCOnRefedEntries) try { // 10->entry1, 11->10=>11->entry1; del 10->entry1 - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(10, entry1); @@ -1793,7 +1793,7 @@ TEST_F(PageDirectoryGCTest, GCOnRefedEntries2) try { // 10->entry1, 11->10=>11->entry1; del 10->entry1 - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(10, entry1); @@ -1836,7 +1836,7 @@ TEST_F(PageDirectoryGCTest, UpsertOnRefedEntries) try { // 10->entry1, 11->10, 12->10 - PageEntryV3 entry1{.file_id = 1, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry1{.file_id = 1, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(10, entry1); @@ -1860,7 +1860,7 @@ try } // upsert 10->entry2 - PageEntryV3 entry2{.file_id = 2, .size = 1024, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry2{.file_id = 2, .size = 1024, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; auto full_gc_entries = dir->getEntriesByBlobIds({1}); @@ -2024,10 +2024,10 @@ try return d; }; - PageEntryV3 entry_1_v1{.file_id = 1, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_1_v2{.file_id = 1, .size = 2, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_2_v1{.file_id = 2, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_2_v2{.file_id = 2, .size = 2, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_1_v1{.file_id = 1, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_1_v2{.file_id = 1, .size = 2, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_2_v1{.file_id = 2, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_2_v2{.file_id = 2, .size = 2, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry_1_v1); @@ -2055,8 +2055,8 @@ try // 10->ext, 11->10, del 10->ext // 50->entry, 51->50, 52->51=>50, del 50 - PageEntryV3 entry_50{.file_id = 1, .size = 50, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_60{.file_id = 1, .size = 90, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_50{.file_id = 1, .size = 50, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_60{.file_id = 1, .size = 90, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; { PageEntriesEdit edit; edit.del(2); @@ 
-2218,9 +2218,9 @@ try Poco::File(fmt::format("{}/{}{}", path, BlobFile::BLOB_PREFIX_NAME, file_id1)).createFile(); Poco::File(fmt::format("{}/{}{}", path, BlobFile::BLOB_PREFIX_NAME, file_id2)).createFile(); - PageEntryV3 entry_1_v1{.file_id = file_id1, .size = 7890, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_5_v1{.file_id = file_id2, .size = 255, .tag = 0, .offset = 0x100, .checksum = 0x4567}; - PageEntryV3 entry_5_v2{.file_id = file_id2, .size = 255, .tag = 0, .offset = 0x400, .checksum = 0x4567}; + PageEntryV3 entry_1_v1{.file_id = file_id1, .size = 7890, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_5_v1{.file_id = file_id2, .size = 255, .padded_size = 0, .tag = 0, .offset = 0x100, .checksum = 0x4567}; + PageEntryV3 entry_5_v2{.file_id = file_id2, .size = 255, .padded_size = 0, .tag = 0, .offset = 0x400, .checksum = 0x4567}; { PageEntriesEdit edit; edit.put(1, entry_1_v1); @@ -2275,8 +2275,8 @@ CATCH TEST_F(PageDirectoryGCTest, CleanAfterDecreaseRef) try { - PageEntryV3 entry_50_1{.file_id = 1, .size = 7890, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_50_2{.file_id = 2, .size = 7890, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_50_1{.file_id = 1, .size = 7890, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_50_2{.file_id = 2, .size = 7890, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; auto restore_from_edit = [](const PageEntriesEdit & edit) { auto ctx = ::DB::tests::TiFlashTestEnv::getContext(); diff --git a/dbms/src/Storages/Page/V3/tests/gtest_page_storage.cpp b/dbms/src/Storages/Page/V3/tests/gtest_page_storage.cpp index f7ba33c46c8..f9ef25cb973 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_page_storage.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_page_storage.cpp @@ -1441,6 +1441,55 @@ try } CATCH +TEST_F(PageStorageTest, EntryTagAfterFullGC) +try +{ + { + PageStorage::Config config; + config.blob_heavy_gc_valid_rate = 1.0; /// always run full gc + page_storage = reopenWithConfig(config); + } + + const size_t buf_sz = 1024; + char c_buff[buf_sz]; + + for (size_t i = 0; i < buf_sz; ++i) + { + c_buff[i] = i % 0xff; + } + + PageId page_id = 120; + UInt64 tag = 12345; + { + WriteBatch batch; + batch.putPage(page_id, tag, std::make_shared(c_buff, buf_sz), buf_sz, {}); + page_storage->write(std::move(batch)); + } + + { + auto entry = page_storage->getEntry(page_id); + ASSERT_EQ(entry.tag, tag); + auto page = page_storage->read(page_id); + for (size_t i = 0; i < buf_sz; ++i) + { + EXPECT_EQ(*(page.data.begin() + i), static_cast(i % 0xff)); + } + } + + auto done_full_gc = page_storage->gc(); + EXPECT_TRUE(done_full_gc); + + { + auto entry = page_storage->getEntry(page_id); + ASSERT_EQ(entry.tag, tag); + auto page = page_storage->read(page_id); + for (size_t i = 0; i < buf_sz; ++i) + { + EXPECT_EQ(*(page.data.begin() + i), static_cast(i % 0xff)); + } + } +} +CATCH } // namespace PS::V3::tests } // namespace DB diff --git a/dbms/src/Storages/Page/V3/tests/gtest_page_storage_mix_mode.cpp b/dbms/src/Storages/Page/V3/tests/gtest_page_storage_mix_mode.cpp index 078daa3e5b4..74e56c929d8 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_page_storage_mix_mode.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_page_storage_mix_mode.cpp @@ -85,6 +85,16 @@ class PageStorageMixedTest : public DB::base::TiFlashStorageTestBasic return run_mode; } + PageReader newMixedPageReader(PageStorage::SnapshotPtr & snapshot) + { + return 
storage_pool_mix->newLogReader(nullptr, snapshot); + } + + PageReader newMixedPageReader() + { + return storage_pool_mix->newLogReader(nullptr, true, "PageStorageMixedTest"); + } + void reloadV2StoragePool() { db_context->setPageStorageRunMode(PageStorageRunMode::ONLY_V2); @@ -1035,7 +1045,7 @@ try // Thread A create snapshot for read auto snapshot_mix_before_merge_delta = page_reader_mix->getSnapshot("ReadWithSnapshotAfterMergeDelta"); { - auto page_reader_mix_with_snap = storage_pool_mix->newLogReader(nullptr, snapshot_mix_before_merge_delta); + auto page_reader_mix_with_snap = newMixedPageReader(snapshot_mix_before_merge_delta); const auto & page1 = page_reader_mix_with_snap.read(1); const auto & page2 = page_reader_mix_with_snap.read(2); const auto & page3 = page_reader_mix_with_snap.read(3); @@ -1044,7 +1054,7 @@ try ASSERT_PAGE_EQ(c_buff2, buf_sz2, page3, 3); } { - auto page_reader_mix_with_snap = storage_pool_mix->newLogReader(nullptr, true, "ReadWithSnapshotAfterMergeDelta"); + auto page_reader_mix_with_snap = newMixedPageReader(); const auto & page1 = page_reader_mix_with_snap.read(1); const auto & page2 = page_reader_mix_with_snap.read(2); const auto & page3 = page_reader_mix_with_snap.read(3); @@ -1063,7 +1073,7 @@ try } // Thread A continue to read 1, 3 { - auto page_reader_mix_with_snap = storage_pool_mix->newLogReader(nullptr, snapshot_mix_before_merge_delta); + auto page_reader_mix_with_snap = newMixedPageReader(snapshot_mix_before_merge_delta); // read 1, 3 with snapshot, should be success const auto & page1 = page_reader_mix_with_snap.read(1); const auto & page3 = page_reader_mix_with_snap.read(3); @@ -1071,6 +1081,7 @@ try ASSERT_PAGE_EQ(c_buff2, buf_sz2, page3, 3); ASSERT_THROW(page_reader_mix_with_snap.read(4), DB::Exception); } + { // Revert v3 WriteBatch batch; @@ -1081,6 +1092,290 @@ try } CATCH +TEST_F(PageStorageMixedTest, refWithSnapshot2) +try +{ + UInt64 tag = 0; + const size_t buf_sz = 1024; + char c_buff[buf_sz]; + for (size_t i = 0; i < buf_sz; ++i) + { + c_buff[i] = i % 0xff; + } + + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(1, tag, buff, buf_sz); + page_writer_v2->write(std::move(batch), nullptr); + } + + { + WriteBatch batch; + batch.putRefPage(2, 1); + page_writer_v2->write(std::move(batch), nullptr); + } + + // Change to mix mode here + ASSERT_EQ(reloadMixedStoragePool(), PageStorageRunMode::MIX_MODE); + + auto snapshot_mix = page_reader_mix->getSnapshot(""); + { + WriteBatch batch; + batch.delPage(1); + batch.delPage(2); + page_writer_mix->write(std::move(batch), nullptr); + } + + { + auto page_maps = newMixedPageReader(snapshot_mix).read({1, 2}); + ASSERT_EQ(page_maps.size(), 2); + + ASSERT_PAGE_EQ(c_buff, buf_sz, page_maps[1], 1); + ASSERT_PAGE_EQ(c_buff, buf_sz, page_maps[2], 2); + } +} +CATCH + +TEST_F(PageStorageMixedTest, refWithSnapshot3) +try +{ + UInt64 tag = 0; + const size_t buf_sz = 1024; + char c_buff[buf_sz]; + for (size_t i = 0; i < buf_sz; ++i) + { + c_buff[i] = i % 0xff; + } + + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(1, tag, buff, buf_sz); + // to keep mix mode + batch.putExternal(10, 1); + page_writer_v2->write(std::move(batch), nullptr); + } + + { + WriteBatch batch; + batch.putRefPage(2, 1); + page_writer_v2->write(std::move(batch), nullptr); + } + + { + WriteBatch batch; + batch.delPage(1); + batch.delPage(2); + page_writer_v2->write(std::move(batch), nullptr); + } + + // Change to mix mode here + 
ASSERT_EQ(reloadMixedStoragePool(), PageStorageRunMode::MIX_MODE); + + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(1, tag, buff, buf_sz); + page_writer_mix->write(std::move(batch), nullptr); + } + + { + WriteBatch batch; + batch.putRefPage(2, 1); + page_writer_mix->write(std::move(batch), nullptr); + } + + auto snapshot_mix = page_reader_mix->getSnapshot(""); + { + WriteBatch batch; + batch.delPage(1); + batch.delPage(2); + page_writer_mix->write(std::move(batch), nullptr); + } + + { + auto page_maps = newMixedPageReader(snapshot_mix).read({1, 2}); + ASSERT_EQ(page_maps.size(), 2); + + ASSERT_PAGE_EQ(c_buff, buf_sz, page_maps[1], 1); + ASSERT_PAGE_EQ(c_buff, buf_sz, page_maps[2], 2); + } +} +CATCH + +TEST_F(PageStorageMixedTest, refWithSnapshot4) +try +{ + UInt64 tag = 0; + const size_t buf_sz = 1024; + char c_buff[buf_sz]; + for (size_t i = 0; i < buf_sz; ++i) + { + c_buff[i] = i % 0xff; + } + + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(1, tag, buff, buf_sz); + page_writer_v2->write(std::move(batch), nullptr); + } + + { + WriteBatch batch; + batch.putRefPage(2, 1); + page_writer_v2->write(std::move(batch), nullptr); + } + + // Change to mix mode here + ASSERT_EQ(reloadMixedStoragePool(), PageStorageRunMode::MIX_MODE); + + { + WriteBatch batch; + batch.delPage(2); + page_writer_mix->write(std::move(batch), nullptr); + } + + { + auto page1 = page_reader_mix->read(1); + + ASSERT_PAGE_EQ(c_buff, buf_sz, page1, 1); + } +} +CATCH + +TEST_F(PageStorageMixedTest, refWithSnapshot5) +try +{ + UInt64 tag = 0; + const size_t buf_sz = 1024; + char c_buff[buf_sz]; + for (size_t i = 0; i < buf_sz; ++i) + { + c_buff[i] = i % 0xff; + } + + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(1, tag, buff, buf_sz); + page_writer_v2->write(std::move(batch), nullptr); + } + + { + WriteBatch batch; + batch.putRefPage(2, 1); + page_writer_v2->write(std::move(batch), nullptr); + } + + { + WriteBatch batch; + batch.delPage(1); + page_writer_v2->write(std::move(batch), nullptr); + } + + // Change to mix mode here + ASSERT_EQ(reloadMixedStoragePool(), PageStorageRunMode::MIX_MODE); + + { + auto page1 = page_reader_mix->read(2); + + ASSERT_PAGE_EQ(c_buff, buf_sz, page1, 2); + } +} +CATCH + +TEST_F(PageStorageMixedTest, refWithSnapshot6) +try +{ + UInt64 tag = 0; + const size_t buf_sz = 1024; + char c_buff[buf_sz]; + for (size_t i = 0; i < buf_sz; ++i) + { + c_buff[i] = i % 0xff; + } + + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff, sizeof(c_buff)); + batch.putPage(1, tag, buff, buf_sz); + page_writer_v2->write(std::move(batch), nullptr); + } + + { + WriteBatch batch; + batch.putRefPage(2, 1); + page_writer_v2->write(std::move(batch), nullptr); + } + + // Change to mix mode here + ASSERT_EQ(reloadMixedStoragePool(), PageStorageRunMode::MIX_MODE); + + { + WriteBatch batch; + batch.delPage(1); + page_writer_mix->write(std::move(batch), nullptr); + } + + { + auto page1 = page_reader_mix->read(2); + + ASSERT_PAGE_EQ(c_buff, buf_sz, page1, 2); + } +} +CATCH + +TEST_F(PageStorageMixedTest, ReadWithSnapshot2) +try +{ + UInt64 tag = 0; + const size_t buf_sz = 1; + char c_buff1[buf_sz]; + c_buff1[0] = 1; + + char c_buff2[buf_sz]; + c_buff2[0] = 2; + + { + WriteBatch batch; + ReadBufferPtr buff = std::make_shared(c_buff1, buf_sz); + batch.putPage(1, tag, buff, buf_sz); + page_writer_v2->write(std::move(batch), nullptr); + } + + // 
Change to mix mode here + ASSERT_EQ(reloadMixedStoragePool(), PageStorageRunMode::MIX_MODE); + + auto snapshot_mix = page_reader_mix->getSnapshot(""); + { + WriteBatch batch; + batch.delPage(1); + ReadBufferPtr buff = std::make_shared(c_buff2, buf_sz); + batch.putPage(1, tag, buff, buf_sz); + page_writer_mix->write(std::move(batch), nullptr); + } + + { + auto page1 = newMixedPageReader(snapshot_mix).read(1); + ASSERT_PAGE_EQ(c_buff1, buf_sz, page1, 1); + } + + { + auto page1 = page_reader_mix->read(1); + ASSERT_PAGE_EQ(c_buff2, buf_sz, page1, 1); + } + + { + // Revert v3 + WriteBatch batch; + batch.delPage(1); + page_writer_mix->write(std::move(batch), nullptr); + } +} +CATCH + } // namespace PS::V3::tests } // namespace DB diff --git a/dbms/src/Storages/Page/V3/tests/gtest_wal_store.cpp b/dbms/src/Storages/Page/V3/tests/gtest_wal_store.cpp index 6d47adabbc5..b4e6c2d9204 100644 --- a/dbms/src/Storages/Page/V3/tests/gtest_wal_store.cpp +++ b/dbms/src/Storages/Page/V3/tests/gtest_wal_store.cpp @@ -34,8 +34,8 @@ namespace DB::PS::V3::tests { TEST(WALSeriTest, AllPuts) { - PageEntryV3 entry_p1{.file_id = 1, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p2{.file_id = 1, .size = 2, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p1{.file_id = 1, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p2{.file_id = 1, .size = 2, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver20(/*seq=*/20); PageEntriesEdit edit; edit.put(1, entry_p1); @@ -56,8 +56,8 @@ TEST(WALSeriTest, AllPuts) TEST(WALSeriTest, PutsAndRefsAndDels) try { - PageEntryV3 entry_p3{.file_id = 1, .size = 3, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p5{.file_id = 1, .size = 5, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p3{.file_id = 1, .size = 3, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p5{.file_id = 1, .size = 5, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver21(/*seq=*/21); PageEntriesEdit edit; edit.put(3, entry_p3); @@ -104,9 +104,9 @@ CATCH TEST(WALSeriTest, Upserts) { - PageEntryV3 entry_p1_2{.file_id = 2, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p3_2{.file_id = 2, .size = 3, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p5_2{.file_id = 2, .size = 5, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p1_2{.file_id = 2, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p3_2{.file_id = 2, .size = 3, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p5_2{.file_id = 2, .size = 5, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver20_1(/*seq=*/20, /*epoch*/ 1); PageVersion ver21_1(/*seq=*/21, /*epoch*/ 1); PageEntriesEdit edit; @@ -164,7 +164,7 @@ TEST(WALSeriTest, RefExternalAndEntry) { PageEntriesEdit edit; - PageEntryV3 entry_p1_2{.file_id = 2, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p1_2{.file_id = 2, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; edit.varEntry(1, ver1_0, entry_p1_2, 2); edit.varDel(1, ver2_0); edit.varRef(2, ver3_0, 1); @@ -405,8 +405,8 @@ try ASSERT_NE(wal, nullptr); // Stage 2. 
Apply with only puts - PageEntryV3 entry_p1{.file_id = 1, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p2{.file_id = 1, .size = 2, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p1{.file_id = 1, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p2{.file_id = 1, .size = 2, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver20(/*seq=*/20); { PageEntriesEdit edit; @@ -435,8 +435,8 @@ try } // Stage 3. Apply with puts and refs - PageEntryV3 entry_p3{.file_id = 1, .size = 3, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p5{.file_id = 1, .size = 5, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p3{.file_id = 1, .size = 3, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p5{.file_id = 1, .size = 5, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver21(/*seq=*/21); { PageEntriesEdit edit; @@ -468,9 +468,9 @@ try // Stage 4. Apply with delete and upsert - PageEntryV3 entry_p1_2{.file_id = 2, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p3_2{.file_id = 2, .size = 3, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p5_2{.file_id = 2, .size = 5, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p1_2{.file_id = 2, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p3_2{.file_id = 2, .size = 3, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p5_2{.file_id = 2, .size = 5, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver20_1(/*seq=*/20, /*epoch*/ 1); PageVersion ver21_1(/*seq=*/21, /*epoch*/ 1); { @@ -514,8 +514,8 @@ try std::vector size_each_edit; // Stage 1. Apply with only puts - PageEntryV3 entry_p1{.file_id = 1, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p2{.file_id = 1, .size = 2, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p1{.file_id = 1, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p2{.file_id = 1, .size = 2, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver20(/*seq=*/20); { PageEntriesEdit edit; @@ -526,8 +526,8 @@ try } // Stage 2. Apply with puts and refs - PageEntryV3 entry_p3{.file_id = 1, .size = 3, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p5{.file_id = 1, .size = 5, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p3{.file_id = 1, .size = 3, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p5{.file_id = 1, .size = 5, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver21(/*seq=*/21); { PageEntriesEdit edit; @@ -540,9 +540,9 @@ try } // Stage 3. 
Apply with delete and upsert - PageEntryV3 entry_p1_2{.file_id = 2, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p3_2{.file_id = 2, .size = 3, .tag = 0, .offset = 0x123, .checksum = 0x4567}; - PageEntryV3 entry_p5_2{.file_id = 2, .size = 5, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p1_2{.file_id = 2, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p3_2{.file_id = 2, .size = 3, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry_p5_2{.file_id = 2, .size = 5, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageVersion ver20_1(/*seq=*/20, /*epoch*/ 1); PageVersion ver21_1(/*seq=*/21, /*epoch*/ 1); { @@ -615,7 +615,7 @@ try PageVersion ver(/*seq*/ 32); for (size_t i = 0; i < num_edits_test; ++i) { - PageEntryV3 entry{.file_id = 2, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry{.file_id = 2, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; PageEntriesEdit edit; const size_t num_pages_put = d_20(rd); for (size_t p = 0; p < num_pages_put; ++p) @@ -660,7 +660,7 @@ try .persisted_log_files = persisted_log_files}; PageEntriesEdit snap_edit; - PageEntryV3 entry{.file_id = 2, .size = 1, .tag = 0, .offset = 0x123, .checksum = 0x4567}; + PageEntryV3 entry{.file_id = 2, .size = 1, .padded_size = 0, .tag = 0, .offset = 0x123, .checksum = 0x4567}; std::uniform_int_distribution<> d_10000(0, 10000); // just fill in some random entry for (size_t i = 0; i < 70; ++i) diff --git a/dbms/src/Storages/Page/stress/stress_page_storage.cpp b/dbms/src/Storages/Page/stress/stress_page_storage.cpp deleted file mode 100644 index 818be710363..00000000000 --- a/dbms/src/Storages/Page/stress/stress_page_storage.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -namespace DB -{ -// Define is_background_thread for this binary -// It is required for `RateLimiter` but we do not link with `BackgroundProcessingPool`. -#if __APPLE__ && __clang__ -__thread bool is_background_thread = false; -#else -thread_local bool is_background_thread = false; -#endif -} // namespace DB - -int main(int argc, char ** argv) -try -{ - StressEnv::initGlobalLogger(); - auto env = StressEnv::parse(argc, argv); - env.setup(); - - auto & mamager = StressWorkloadManger::getInstance(); - mamager.setEnv(env); - mamager.runWorkload(); - - return StressEnvStatus::getInstance().isSuccess(); -} -catch (...) -{ - DB::tryLogCurrentException(""); - exit(-1); -} diff --git a/dbms/src/Storages/Page/workload/CMakeLists.txt b/dbms/src/Storages/Page/workload/CMakeLists.txt new file mode 100644 index 00000000000..adf94c75f11 --- /dev/null +++ b/dbms/src/Storages/Page/workload/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright 2022 PingCAP, Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include_directories (${CMAKE_CURRENT_BINARY_DIR}) + +set (page-workload-src MainEntry.cpp PSBackground.cpp PSRunnable.cpp PSStressEnv.cpp PSWorkload.cpp) + +add_library (page-workload-lib ${page-workload-src}) +target_link_libraries (page-workload-lib dbms clickhouse_functions clickhouse-server-lib) +target_compile_options (page-workload-lib PRIVATE -Wno-format -lc++) \ No newline at end of file diff --git a/dbms/src/Storages/Page/stress/workload/HeavyMemoryCostInGC.cpp b/dbms/src/Storages/Page/workload/HeavyMemoryCostInGC.h similarity index 95% rename from dbms/src/Storages/Page/stress/workload/HeavyMemoryCostInGC.cpp rename to dbms/src/Storages/Page/workload/HeavyMemoryCostInGC.h index 40595f0cb59..3daaf10ffb3 100644 --- a/dbms/src/Storages/Page/stress/workload/HeavyMemoryCostInGC.cpp +++ b/dbms/src/Storages/Page/workload/HeavyMemoryCostInGC.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include +namespace DB::PS::tests +{ class HeavyMemoryCostInGC : public StressWorkload , public StressWorkloadFunc @@ -79,5 +81,4 @@ class HeavyMemoryCostInGC fmt::format("Memory Peak is {} , it should not bigger than {} ", metrics_dumper->getMemoryPeak(), 5 * 1024 * 1024)); } }; - -REGISTER_WORKLOAD(HeavyMemoryCostInGC) +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/workload/HeavyRead.cpp b/dbms/src/Storages/Page/workload/HeavyRead.h similarity index 95% rename from dbms/src/Storages/Page/stress/workload/HeavyRead.cpp rename to dbms/src/Storages/Page/workload/HeavyRead.h index 15aeb1320cf..80023f95988 100644 --- a/dbms/src/Storages/Page/stress/workload/HeavyRead.cpp +++ b/dbms/src/Storages/Page/workload/HeavyRead.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include +namespace DB::PS::tests +{ class HeavyRead : public StressWorkload , public StressWorkloadFunc { @@ -68,5 +70,4 @@ class HeavyRead : public StressWorkload } } }; - -REGISTER_WORKLOAD(HeavyRead) \ No newline at end of file +} // namespace DB::PS::tests \ No newline at end of file diff --git a/dbms/src/Storages/Page/stress/workload/HeavySkewWriteRead.cpp b/dbms/src/Storages/Page/workload/HeavySkewWriteRead.h similarity index 96% rename from dbms/src/Storages/Page/stress/workload/HeavySkewWriteRead.cpp rename to dbms/src/Storages/Page/workload/HeavySkewWriteRead.h index 78ffa5b60e0..0e75bc0d3e5 100644 --- a/dbms/src/Storages/Page/stress/workload/HeavySkewWriteRead.cpp +++ b/dbms/src/Storages/Page/workload/HeavySkewWriteRead.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include +#include +namespace DB::PS::tests +{ class HeavySkewWriteRead : public StressWorkload , public StressWorkloadFunc { @@ -83,5 +85,4 @@ class HeavySkewWriteRead : public StressWorkload return true; } }; - -REGISTER_WORKLOAD(HeavySkewWriteRead) \ No newline at end of file +} // namespace DB::PS::tests \ No newline at end of file diff --git a/dbms/src/Storages/Page/stress/workload/HeavyWrite.cpp b/dbms/src/Storages/Page/workload/HeavyWrite.h similarity index 95% rename from dbms/src/Storages/Page/stress/workload/HeavyWrite.cpp rename to dbms/src/Storages/Page/workload/HeavyWrite.h index 265b289db56..54b7585ee20 100644 --- a/dbms/src/Storages/Page/stress/workload/HeavyWrite.cpp +++ b/dbms/src/Storages/Page/workload/HeavyWrite.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include +namespace DB::PS::tests +{ class HeavyWrite : public StressWorkload , public StressWorkloadFunc { @@ -70,5 +72,4 @@ class HeavyWrite : public StressWorkload return true; } }; - -REGISTER_WORKLOAD(HeavyWrite) \ No newline at end of file +} // namespace DB::PS::tests \ No newline at end of file diff --git a/dbms/src/Storages/Page/stress/workload/HighValidBigFileGC.cpp b/dbms/src/Storages/Page/workload/HighValidBigFileGC.h similarity index 97% rename from dbms/src/Storages/Page/stress/workload/HighValidBigFileGC.cpp rename to dbms/src/Storages/Page/workload/HighValidBigFileGC.h index 866782c9578..cc3b5b45135 100644 --- a/dbms/src/Storages/Page/stress/workload/HighValidBigFileGC.cpp +++ b/dbms/src/Storages/Page/workload/HighValidBigFileGC.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include +namespace DB::PS::tests +{ class HighValidBigFileGCWorkload : public StressWorkload , public StressWorkloadFunc @@ -127,5 +129,4 @@ class HighValidBigFileGCWorkload private: UInt64 gc_time_ms = 0; }; - -REGISTER_WORKLOAD(HighValidBigFileGCWorkload) +} // namespace DB::PS::tests \ No newline at end of file diff --git a/dbms/src/Storages/Page/stress/workload/HoldSnapshotsLongTime.cpp b/dbms/src/Storages/Page/workload/HoldSnapshotsLongTime.h similarity index 96% rename from dbms/src/Storages/Page/stress/workload/HoldSnapshotsLongTime.cpp rename to dbms/src/Storages/Page/workload/HoldSnapshotsLongTime.h index b49347fc858..071a104010c 100644 --- a/dbms/src/Storages/Page/stress/workload/HoldSnapshotsLongTime.cpp +++ b/dbms/src/Storages/Page/workload/HoldSnapshotsLongTime.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include +namespace DB::PS::tests +{ class HoldSnapshotsLongTime : public StressWorkload , public StressWorkloadFunc { @@ -92,5 +94,4 @@ class HoldSnapshotsLongTime : public StressWorkload return true; } }; - -REGISTER_WORKLOAD(HoldSnapshotsLongTime) \ No newline at end of file +} // namespace DB::PS::tests \ No newline at end of file diff --git a/dbms/src/Storages/Page/workload/MainEntry.cpp b/dbms/src/Storages/Page/workload/MainEntry.cpp new file mode 100644 index 00000000000..18e42106c90 --- /dev/null +++ b/dbms/src/Storages/Page/workload/MainEntry.cpp @@ -0,0 +1,58 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace DB::PS::tests; + +int StressWorkload::mainEntry(int argc, char ** argv) +{ + { + work_load_register<HeavyMemoryCostInGC>(); + work_load_register<HeavyRead>(); + work_load_register<HeavySkewWriteRead>(); + work_load_register<HeavyWrite>(); + work_load_register<HighValidBigFileGCWorkload>(); + work_load_register<HoldSnapshotsLongTime>(); + work_load_register<NormalWorkload>(); + work_load_register<PageStorageInMemoryCapacity>(); + work_load_register<ThousandsOfOffset>(); + } + try + { + StressEnv::initGlobalLogger(); + auto env = StressEnv::parse(argc, argv); + env.setup(); + + auto & manager = StressWorkloadManger::getInstance(); + manager.setEnv(env); + manager.runWorkload(); + + return StressEnvStatus::getInstance().isSuccess(); + } + catch (...) + { + DB::tryLogCurrentException(""); + exit(-1); + } +} \ No newline at end of file diff --git a/dbms/src/Storages/Page/stress/workload/Normal.cpp b/dbms/src/Storages/Page/workload/Normal.h similarity index 95% rename from dbms/src/Storages/Page/stress/workload/Normal.cpp rename to dbms/src/Storages/Page/workload/Normal.h index 0323b857613..164f17b9d61 100644 --- a/dbms/src/Storages/Page/stress/workload/Normal.cpp +++ b/dbms/src/Storages/Page/workload/Normal.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include +namespace DB::PS::tests +{ class NormalWorkload : public StressWorkload , public StressWorkloadFunc @@ -75,5 +77,4 @@ class NormalWorkload stop_watch.stop(); } }; - -REGISTER_WORKLOAD(NormalWorkload) +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/PSBackground.cpp b/dbms/src/Storages/Page/workload/PSBackground.cpp similarity index 96% rename from dbms/src/Storages/Page/stress/PSBackground.cpp rename to dbms/src/Storages/Page/workload/PSBackground.cpp index af7329e8348..247bea23dcc 100644 --- a/dbms/src/Storages/Page/stress/PSBackground.cpp +++ b/dbms/src/Storages/Page/workload/PSBackground.cpp @@ -13,11 +13,14 @@ // limitations under the License. 
#include -#include #include #include +#include #include + +namespace DB::PS::tests +{ void PSMetricsDumper::onTime(Poco::Timer & /*timer*/) { for (auto & metric : metrics) @@ -107,3 +110,4 @@ void StressTimeout::start() { timeout_timer.start(Poco::TimerCallback(*this, &StressTimeout::onTime)); } +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/PSBackground.h b/dbms/src/Storages/Page/workload/PSBackground.h similarity index 97% rename from dbms/src/Storages/Page/stress/PSBackground.h rename to dbms/src/Storages/Page/workload/PSBackground.h index 8c22458c5e8..c91dad1361f 100644 --- a/dbms/src/Storages/Page/stress/PSBackground.h +++ b/dbms/src/Storages/Page/workload/PSBackground.h @@ -15,14 +15,16 @@ #pragma once #include #include -#include #include +#include namespace CurrentMetrics { extern const Metric PSMVCCSnapshotsList; } +namespace DB::PS::tests +{ class PSMetricsDumper { public: @@ -162,3 +164,4 @@ class StressTimeout Poco::Timer timeout_timer; }; using StressTimeoutPtr = std::shared_ptr; +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/PSRunnable.cpp b/dbms/src/Storages/Page/workload/PSRunnable.cpp similarity index 97% rename from dbms/src/Storages/Page/stress/PSRunnable.cpp rename to dbms/src/Storages/Page/workload/PSRunnable.cpp index 5d6c8ecc5c6..5e9774ccc99 100644 --- a/dbms/src/Storages/Page/stress/PSRunnable.cpp +++ b/dbms/src/Storages/Page/workload/PSRunnable.cpp @@ -16,14 +16,16 @@ #include #include #include -#include #include #include +#include #include #include #include +namespace DB::PS::tests +{ void PSRunnable::run() try { @@ -69,7 +71,7 @@ DB::ReadBufferPtr PSWriter::genRandomData(const DB::PageId pageId, DB::MemHolder std::uniform_int_distribution<> dist(0, 3000); const size_t buff_sz = approx_page_mb * DB::MB + dist(size_gen); - char * buff = static_cast(malloc(buff_sz)); + char * buff = static_cast(malloc(buff_sz)); // NOLINT if (buff == nullptr) { throw DB::Exception("Alloc fix memory failed.", DB::ErrorCodes::LOGICAL_ERROR); @@ -78,7 +80,7 @@ DB::ReadBufferPtr PSWriter::genRandomData(const DB::PageId pageId, DB::MemHolder const char buff_ch = pageId % 0xFF; memset(buff, buff_ch, buff_sz); - holder = DB::createMemHolder(buff, [&](char * p) { free(p); }); + holder = DB::createMemHolder(buff, [&](char * p) { free(p); }); // NOLINT return std::make_shared(const_cast(buff), buff_sz); } @@ -88,7 +90,7 @@ void PSWriter::updatedRandomData() size_t memory_size = approx_page_mb * DB::MB * 2; if (memory == nullptr) { - memory = static_cast(malloc(memory_size)); + memory = static_cast(malloc(memory_size)); // NOLINT if (memory == nullptr) { throw DB::Exception("Alloc fix memory failed.", DB::ErrorCodes::LOGICAL_ERROR); @@ -147,7 +149,7 @@ void PSCommonWriter::updatedRandomData() if (memory == nullptr) { - memory = static_cast(malloc(memory_size)); + memory = static_cast(malloc(memory_size)); // NOLINT if (memory == nullptr) { throw DB::Exception("Alloc fix memory failed.", DB::ErrorCodes::LOGICAL_ERROR); @@ -415,3 +417,4 @@ DB::PageId PSIncreaseWriter::genRandomPageId() { return static_cast(begin_page_id++); } +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/PSRunnable.h b/dbms/src/Storages/Page/workload/PSRunnable.h similarity index 90% rename from dbms/src/Storages/Page/stress/PSRunnable.h rename to dbms/src/Storages/Page/workload/PSRunnable.h index 3ddcd73c093..b723236391d 100644 --- a/dbms/src/Storages/Page/stress/PSRunnable.h +++ b/dbms/src/Storages/Page/workload/PSRunnable.h @@ -13,12 +13,14 @@ // 
limitations under the License. #pragma once -#include #include #include +#include const DB::PageId MAX_PAGE_ID_DEFAULT = 1000; +namespace DB::PS::tests +{ class PSRunnable : public Poco::Runnable { public: @@ -46,7 +48,7 @@ class PSWriter : public PSRunnable gen.seed(time(nullptr)); } - virtual ~PSWriter() + ~PSWriter() override { if (memory != nullptr) { @@ -54,7 +56,7 @@ class PSWriter : public PSRunnable } } - virtual String description() override + String description() override { return fmt::format("(Stress Test Writer {})", index); } @@ -67,7 +69,7 @@ class PSWriter : public PSRunnable static void fillAllPages(const PSPtr & ps); - virtual bool runImpl() override; + bool runImpl() override; protected: virtual DB::PageId genRandomPageId(); @@ -91,11 +93,11 @@ class PSCommonWriter : public PSWriter : PSWriter(ps_, index_) {} - virtual void updatedRandomData() override; + void updatedRandomData() override; - virtual String description() override { return fmt::format("(Stress Test Common Writer {})", index); } + String description() override { return fmt::format("(Stress Test Common Writer {})", index); } - virtual bool runImpl() override; + bool runImpl() override; void setBatchBufferNums(size_t numbers); @@ -120,7 +122,7 @@ class PSCommonWriter : public PSWriter DB::PageFieldSizes data_sizes = {}; - virtual DB::PageId genRandomPageId() override; + DB::PageId genRandomPageId() override; virtual size_t genBufferSize(); }; @@ -154,7 +156,7 @@ class PSWindowWriter : public PSCommonWriter void setNormalDistributionSigma(size_t sigma); protected: - virtual DB::PageId genRandomPageId() override; + DB::PageId genRandomPageId() override; protected: size_t window_size = 100; @@ -170,12 +172,12 @@ class PSIncreaseWriter : public PSCommonWriter String description() override { return fmt::format("(Stress Test Increase Writer {})", index); } - virtual bool runImpl() override; + bool runImpl() override; void setPageRange(size_t page_range); protected: - virtual DB::PageId genRandomPageId() override; + DB::PageId genRandomPageId() override; protected: size_t begin_page_id = 1; @@ -192,9 +194,9 @@ class PSReader : public PSRunnable gen.seed(time(nullptr)); } - virtual String description() override { return fmt::format("(Stress Test PSReader {})", index); } + String description() override { return fmt::format("(Stress Test PSReader {})", index); } - virtual bool runImpl() override; + bool runImpl() override; void setPageReadOnce(size_t page_read_once); @@ -242,7 +244,7 @@ class PSWindowReader : public PSReader void setWriterNums(size_t writer_nums); protected: - virtual DB::PageIds genRandomPageIds() override; + DB::PageIds genRandomPageIds() override; protected: size_t window_size = 100; @@ -261,12 +263,13 @@ class PSSnapshotReader : public PSReader : PSReader(ps_, index_) {} - virtual bool runImpl() override; + bool runImpl() override; void setSnapshotGetIntervalMs(size_t snapshot_get_interval_ms_); protected: - size_t snapshots_hold_num; + size_t snapshots_hold_num = 0; size_t snapshot_get_interval_ms = 0; std::list snapshots; -}; \ No newline at end of file +}; +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/PSStressEnv.cpp b/dbms/src/Storages/Page/workload/PSStressEnv.cpp similarity index 97% rename from dbms/src/Storages/Page/stress/PSStressEnv.cpp rename to dbms/src/Storages/Page/workload/PSStressEnv.cpp index 7d680cd43c0..f5cead0a158 100644 --- a/dbms/src/Storages/Page/stress/PSStressEnv.cpp +++ b/dbms/src/Storages/Page/workload/PSStressEnv.cpp @@ -16,18 +16,20 @@ #include 
#include #include -#include -#include #include #include #include #include #include #include +#include +#include #include #include +namespace DB::PS::tests +{ Poco::Logger * StressEnv::logger; void StressEnv::initGlobalLogger() { @@ -146,3 +148,4 @@ void StressEnv::setup() init_pages = true; setupSignal(); } +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/PSStressEnv.h b/dbms/src/Storages/Page/workload/PSStressEnv.h similarity index 98% rename from dbms/src/Storages/Page/stress/PSStressEnv.h rename to dbms/src/Storages/Page/workload/PSStressEnv.h index 1c7d8ee761f..e67cb325430 100644 --- a/dbms/src/Storages/Page/stress/PSStressEnv.h +++ b/dbms/src/Storages/Page/workload/PSStressEnv.h @@ -25,6 +25,8 @@ namespace Poco class Logger; } +namespace DB::PS::tests +{ using PSPtr = std::shared_ptr; enum StressEnvStat @@ -124,3 +126,4 @@ struct StressEnv void setup(); }; +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/PSWorkload.cpp b/dbms/src/Storages/Page/workload/PSWorkload.cpp similarity index 98% rename from dbms/src/Storages/Page/stress/PSWorkload.cpp rename to dbms/src/Storages/Page/workload/PSWorkload.cpp index ce1f8d92ce0..81f13527f48 100644 --- a/dbms/src/Storages/Page/stress/PSWorkload.cpp +++ b/dbms/src/Storages/Page/workload/PSWorkload.cpp @@ -14,12 +14,14 @@ #include #include -#include #include #include #include +#include #include +namespace DB::PS::tests +{ void StressWorkload::onDumpResult() { UInt64 time_interval = stop_watch.elapsedMilliseconds(); @@ -177,3 +179,4 @@ void StressWorkloadManger::runWorkload() } } } +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/PSWorkload.h b/dbms/src/Storages/Page/workload/PSWorkload.h similarity index 85% rename from dbms/src/Storages/Page/stress/PSWorkload.h rename to dbms/src/Storages/Page/workload/PSWorkload.h index cb099b4203a..26a9c24d6da 100644 --- a/dbms/src/Storages/Page/stress/PSWorkload.h +++ b/dbms/src/Storages/Page/workload/PSWorkload.h @@ -16,15 +16,17 @@ #include #include -#include -#include -#include #include #include #include +#include +#include +#include #include #define NORMAL_WORKLOAD 0 +namespace DB::PS::tests +{ template class StressWorkloadFunc { @@ -45,6 +47,8 @@ class StressWorkloadFunc class StressWorkload { public: + static int mainEntry(int argc, char ** argv); + explicit StressWorkload(StressEnv options_) : options(options_) {} @@ -189,13 +193,15 @@ class StressWorkloadManger StressEnv options; }; -#define REGISTER_WORKLOAD(WORKLOAD) \ - static void __attribute__((constructor)) _work_load_register_named_##WORKLOAD(void) \ - { \ - StressWorkloadManger::getInstance().reg( \ - WORKLOAD::nameFunc(), \ - WORKLOAD::maskFunc(), \ - [](const StressEnv & opts) -> std::shared_ptr { \ - return std::make_shared(opts); \ - }); \ - } +template +void work_load_register() +{ + StressWorkloadManger::getInstance().reg( + Workload::nameFunc(), + Workload::maskFunc(), + [](const StressEnv & opts) -> std::shared_ptr { + return std::make_shared(opts); + }); +} + +} // namespace DB::PS::tests diff --git a/dbms/src/Storages/Page/stress/workload/PageStorageInMemoryCapacity.cpp b/dbms/src/Storages/Page/workload/PageStorageInMemoryCapacity.h similarity index 97% rename from dbms/src/Storages/Page/stress/workload/PageStorageInMemoryCapacity.cpp rename to dbms/src/Storages/Page/workload/PageStorageInMemoryCapacity.h index 190cbf6b323..337c732e6f7 100644 --- a/dbms/src/Storages/Page/stress/workload/PageStorageInMemoryCapacity.cpp +++ 
b/dbms/src/Storages/Page/workload/PageStorageInMemoryCapacity.h @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include #include #include #include #include - #ifdef __APPLE__ #include @@ -27,6 +26,8 @@ #include #endif +namespace DB::PS::tests +{ class PageStorageInMemoryCapacity : public StressWorkload , public StressWorkloadFunc { @@ -89,14 +90,14 @@ class PageStorageInMemoryCapacity : public StressWorkload } FILE * file = fopen("/proc/meminfo", "r"); - if (file != NULL) + if (file != nullptr) { char buffer[128]; #define MEMORY_TOTAL_LABEL "MemTotal:" while (fgets(buffer, 128, file)) { if ((strncmp((buffer), (MEMORY_TOTAL_LABEL), strlen(MEMORY_TOTAL_LABEL)) == 0) - && sscanf(buffer + strlen(MEMORY_TOTAL_LABEL), " %32llu kB", &total_mem)) + && sscanf(buffer + strlen(MEMORY_TOTAL_LABEL), " %32llu kB", &total_mem)) // NOLINT { break; } @@ -173,5 +174,4 @@ class PageStorageInMemoryCapacity : public StressWorkload std::round(resident_used) ? (total_mem / ((double)resident_used / page_writen)) : 0)); } }; - -REGISTER_WORKLOAD(PageStorageInMemoryCapacity) \ No newline at end of file +} // namespace DB::PS::tests \ No newline at end of file diff --git a/dbms/src/Storages/Page/stress/workload/ThousandsOfOffset.cpp b/dbms/src/Storages/Page/workload/ThousandsOfOffset.h similarity index 98% rename from dbms/src/Storages/Page/stress/workload/ThousandsOfOffset.cpp rename to dbms/src/Storages/Page/workload/ThousandsOfOffset.h index 3a215f76769..0232ea235f1 100644 --- a/dbms/src/Storages/Page/stress/workload/ThousandsOfOffset.cpp +++ b/dbms/src/Storages/Page/workload/ThousandsOfOffset.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include +#include +namespace DB::PS::tests +{ class ThousandsOfOffset : public StressWorkload , public StressWorkloadFunc { @@ -167,5 +169,4 @@ class ThousandsOfOffset : public StressWorkload return true; } }; - -REGISTER_WORKLOAD(ThousandsOfOffset) \ No newline at end of file +} // namespace DB::PS::tests \ No newline at end of file diff --git a/dbms/src/Storages/StorageBuffer.cpp b/dbms/src/Storages/StorageBuffer.cpp index 0dc05674696..1d7c0ace57f 100644 --- a/dbms/src/Storages/StorageBuffer.cpp +++ b/dbms/src/Storages/StorageBuffer.cpp @@ -34,24 +34,6 @@ #include - -namespace ProfileEvents -{ -extern const Event StorageBufferFlush; -extern const Event StorageBufferErrorOnFlush; -extern const Event StorageBufferPassedAllMinThresholds; -extern const Event StorageBufferPassedTimeMaxThreshold; -extern const Event StorageBufferPassedRowsMaxThreshold; -extern const Event StorageBufferPassedBytesMaxThreshold; -} // namespace ProfileEvents - -namespace CurrentMetrics -{ -extern const Metric StorageBufferRows; -extern const Metric StorageBufferBytes; -} // namespace CurrentMetrics - - namespace DB { namespace ErrorCodes @@ -170,10 +152,6 @@ static void appendBlock(const Block & from, Block & to) to.checkNumberOfRows(); size_t rows = from.rows(); - size_t bytes = from.bytes(); - - CurrentMetrics::add(CurrentMetrics::StorageBufferRows, rows); - CurrentMetrics::add(CurrentMetrics::StorageBufferBytes, bytes); size_t old_rows = to.rows(); @@ -430,25 +408,21 @@ bool StorageBuffer::checkThresholdsImpl(size_t rows, size_t bytes, time_t time_p { if (time_passed > min_thresholds.time && rows > min_thresholds.rows && bytes > min_thresholds.bytes) { - ProfileEvents::increment(ProfileEvents::StorageBufferPassedAllMinThresholds); return true; } if (time_passed > max_thresholds.time) { - ProfileEvents::increment(ProfileEvents::StorageBufferPassedTimeMaxThreshold); return true; } if (rows > max_thresholds.rows) { - ProfileEvents::increment(ProfileEvents::StorageBufferPassedRowsMaxThreshold); return true; } if (bytes > max_thresholds.bytes) { - ProfileEvents::increment(ProfileEvents::StorageBufferPassedBytesMaxThreshold); return true; } @@ -495,11 +469,6 @@ void StorageBuffer::flushBuffer(Buffer & buffer, bool check_thresholds) buffer.data.swap(block_to_write); buffer.first_write_time = 0; - CurrentMetrics::sub(CurrentMetrics::StorageBufferRows, block_to_write.rows()); - CurrentMetrics::sub(CurrentMetrics::StorageBufferBytes, block_to_write.bytes()); - - ProfileEvents::increment(ProfileEvents::StorageBufferFlush); - LOG_FMT_TRACE(log, "Flushing buffer with {} rows, {} bytes, age {} seconds.", rows, bytes, time_passed); if (no_destination) @@ -517,13 +486,7 @@ void StorageBuffer::flushBuffer(Buffer & buffer, bool check_thresholds) } catch (...) { - ProfileEvents::increment(ProfileEvents::StorageBufferErrorOnFlush); - /// Return the block to its place in the buffer. - - CurrentMetrics::add(CurrentMetrics::StorageBufferRows, block_to_write.rows()); - CurrentMetrics::add(CurrentMetrics::StorageBufferBytes, block_to_write.bytes()); - buffer.data.swap(block_to_write); if (!buffer.first_write_time) diff --git a/dbms/src/Storages/StorageCatBoostPool.cpp b/dbms/src/Storages/StorageCatBoostPool.cpp deleted file mode 100644 index 317cac21d52..00000000000 --- a/dbms/src/Storages/StorageCatBoostPool.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_OPEN_FILE; - extern const int CANNOT_PARSE_TEXT; - extern const int DATABASE_ACCESS_DENIED; -} - -namespace -{ -class CatBoostDatasetBlockInputStream : public IProfilingBlockInputStream -{ -public: - - CatBoostDatasetBlockInputStream(const std::string & file_name, const std::string & format_name, - const Block & sample_block, const Context & context, size_t max_block_size) - : file_name(file_name), format_name(format_name) - { - read_buf = std::make_unique(file_name); - reader = FormatFactory().getInput(format_name, *read_buf, sample_block, context, max_block_size); - } - - String getName() const override - { - return "CatBoostDataset"; - } - - Block readImpl() override - { - return reader->read(); - } - - void readPrefixImpl() override - { - reader->readPrefix(); - } - - void readSuffixImpl() override - { - reader->readSuffix(); - } - - Block getHeader() const override { return sample_block; }; - -private: - Block sample_block; - std::unique_ptr read_buf; - BlockInputStreamPtr reader; - std::string file_name; - std::string format_name; -}; - -} - -static boost::filesystem::path canonicalPath(std::string && path) -{ - return boost::filesystem::canonical(boost::filesystem::path(path)); -} - -static std::string resolvePath(const boost::filesystem::path & base_path, std::string && path) -{ - boost::filesystem::path resolved_path(path); - if (!resolved_path.is_absolute()) - return (base_path / resolved_path).string(); - return resolved_path.string(); -} - -static void checkCreationIsAllowed(const String & base_path, const String & path) -{ - if (base_path != path.substr(0, base_path.size())) - throw Exception( - "Using file descriptor or user specified path as source of storage isn't allowed for server daemons", - ErrorCodes::DATABASE_ACCESS_DENIED); -} - - -StorageCatBoostPool::StorageCatBoostPool(const Context & context, - String column_description_file_name_, - String data_description_file_name_) - : column_description_file_name(std::move(column_description_file_name_)), - data_description_file_name(std::move(data_description_file_name_)) -{ - auto base_path = canonicalPath(context.getPath()); - column_description_file_name = resolvePath(base_path, std::move(column_description_file_name)); - data_description_file_name = resolvePath(base_path, std::move(data_description_file_name)); - if (context.getApplicationType() == Context::ApplicationType::SERVER) - { - const auto & base_path_str = base_path.string(); - checkCreationIsAllowed(base_path_str, column_description_file_name); - checkCreationIsAllowed(base_path_str, data_description_file_name); - } - - parseColumnDescription(); - createSampleBlockAndColumns(); -} - -std::string StorageCatBoostPool::getColumnTypesString(const ColumnTypesMap & columnTypesMap) -{ - std::string types_string; - bool first = true; - for (const auto & value : columnTypesMap) - { - if (!first) - types_string.append(", "); - - first = false; - types_string += 
value.first; - } - - return types_string; -} - -void StorageCatBoostPool::checkDatasetDescription() -{ - std::ifstream in(data_description_file_name); - if (!in.good()) - throw Exception("Cannot open file: " + data_description_file_name, ErrorCodes::CANNOT_OPEN_FILE); - - std::string line; - if (!std::getline(in, line)) - throw Exception("File is empty: " + data_description_file_name, ErrorCodes::CANNOT_PARSE_TEXT); - - size_t columns_count = 1; - for (char sym : line) - if (sym == '\t') - ++columns_count; - - columns_description.resize(columns_count); -} - -void StorageCatBoostPool::parseColumnDescription() -{ - /// NOTE: simple parsing - /// TODO: use ReadBufferFromFile - - checkDatasetDescription(); - - std::ifstream in(column_description_file_name); - if (!in.good()) - throw Exception("Cannot open file: " + column_description_file_name, ErrorCodes::CANNOT_OPEN_FILE); - - std::string line; - size_t line_num = 0; - auto column_types_map = getColumnTypesMap(); - auto column_types_string = getColumnTypesString(column_types_map); - - /// Enumerate default names for columns as Auxiliary, Auxiliary1, Auxiliary2, ... - std::map columns_per_type_count; - - while (std::getline(in, line)) - { - ++line_num; - std::string str_line_num = std::to_string(line_num); - - if (line.empty()) - continue; - - std::istringstream iss(line); - std::vector tokens; - std::string token; - while (std::getline(iss, token, '\t')) - tokens.push_back(token); - - if (tokens.size() != 2 && tokens.size() != 3) - throw Exception("Cannot parse column description at line " + str_line_num + " '" + line + "' " - + ": expected 2 or 3 columns, got " + std::to_string(tokens.size()), - ErrorCodes::CANNOT_PARSE_TEXT); - - std::string str_id = tokens[0]; - std::string col_type = tokens[1]; - std::string col_alias = tokens.size() > 2 ? tokens[2] : ""; - - size_t num_id; - try - { - num_id = std::stoull(str_id); - } - catch (std::exception & e) - { - throw Exception("Cannot parse column index at row " + str_line_num + ": " + e.what(), - ErrorCodes::CANNOT_PARSE_TEXT); - } - - if (num_id >= columns_description.size()) - throw Exception("Invalid index at row " + str_line_num + ": " + str_id - + ", expected in range [0, " + std::to_string(columns_description.size()) + ")", - ErrorCodes::CANNOT_PARSE_TEXT); - - if (column_types_map.count(col_type) == 0) - throw Exception("Invalid column type: " + col_type + ", expected: " + column_types_string, - ErrorCodes::CANNOT_PARSE_TEXT); - - auto type = column_types_map[col_type]; - - std::string col_name; - - bool is_feature_column = type == DatasetColumnType::Num || type == DatasetColumnType::Categ; - auto & col_number = columns_per_type_count[type]; - /// If column is not feature skip '0' after the name (to use 'Target' instead of 'Target0'). - col_name = col_type + (is_feature_column || col_number ? 
std::to_string(col_number) : ""); - ++col_number; - - columns_description[num_id] = ColumnDescription(col_name, col_alias, type); - } -} - -void StorageCatBoostPool::createSampleBlockAndColumns() -{ - ColumnsDescription columns; - NamesAndTypesList cat_columns; - NamesAndTypesList num_columns; - sample_block.clear(); - for (auto & desc : columns_description) - { - DataTypePtr type; - if (desc.column_type == DatasetColumnType::Categ - || desc.column_type == DatasetColumnType::Auxiliary - || desc.column_type == DatasetColumnType::DocId) - type = std::make_shared(); - else - type = std::make_shared(); - - if (desc.column_type == DatasetColumnType::Categ) - cat_columns.emplace_back(desc.column_name, type); - else if (desc.column_type == DatasetColumnType::Num) - num_columns.emplace_back(desc.column_name, type); - else - columns.materialized.emplace_back(desc.column_name, type); - - if (!desc.alias.empty()) - { - auto alias = std::make_shared(desc.column_name); - columns.defaults[desc.alias] = {ColumnDefaultKind::Alias, alias}; - columns.aliases.emplace_back(desc.alias, type); - } - - sample_block.insert(ColumnWithTypeAndName(type, desc.column_name)); - } - columns.ordinary.insert(columns.ordinary.end(), num_columns.begin(), num_columns.end()); - columns.ordinary.insert(columns.ordinary.end(), cat_columns.begin(), cat_columns.end()); - - setColumns(columns); -} - -BlockInputStreams StorageCatBoostPool::read(const Names & column_names, - const SelectQueryInfo & /*query_info*/, - const Context & context, - QueryProcessingStage::Enum & /*processed_stage*/, - size_t max_block_size, - unsigned /*threads*/) -{ - auto stream = std::make_shared( - data_description_file_name, "TSV", sample_block, context, max_block_size); - - auto filter_stream = std::make_shared(stream, column_names, false); - return { filter_stream }; -} - -} diff --git a/dbms/src/Storages/StorageCatBoostPool.h b/dbms/src/Storages/StorageCatBoostPool.h deleted file mode 100644 index 0f4f7c2cede..00000000000 --- a/dbms/src/Storages/StorageCatBoostPool.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include - -#include - -namespace DB -{ -class StorageCatBoostPool : public ext::SharedPtrHelper - , public IStorage -{ -public: - std::string getName() const override { return "CatBoostPool"; } - - std::string getTableName() const override { return table_name; } - - BlockInputStreams read(const Names & column_names, - const SelectQueryInfo & query_info, - const Context & context, - QueryProcessingStage::Enum & processed_stage, - size_t max_block_size, - unsigned threads) override; - -private: - String table_name; - - String column_description_file_name; - String data_description_file_name; - Block sample_block; - - enum class DatasetColumnType - { - Target, - Num, - Categ, - Auxiliary, - DocId, - Weight, - Baseline - }; - - using ColumnTypesMap = std::map; - - ColumnTypesMap getColumnTypesMap() const - { - return { - {"Target", DatasetColumnType::Target}, - {"Num", DatasetColumnType::Num}, - {"Categ", DatasetColumnType::Categ}, - {"Auxiliary", DatasetColumnType::Auxiliary}, - {"DocId", DatasetColumnType::DocId}, - {"Weight", DatasetColumnType::Weight}, - {"Baseline", DatasetColumnType::Baseline}, - }; - }; - - std::string getColumnTypesString(const ColumnTypesMap & columnTypesMap); - - struct ColumnDescription - { - std::string column_name; - std::string alias; - DatasetColumnType column_type; - - ColumnDescription() - : column_type(DatasetColumnType::Num) - {} - ColumnDescription(std::string column_name, std::string alias, DatasetColumnType column_type) - : column_name(std::move(column_name)) - , alias(std::move(alias)) - , column_type(column_type) - {} - }; - - std::vector columns_description; - - void checkDatasetDescription(); - void parseColumnDescription(); - void createSampleBlockAndColumns(); - -protected: - StorageCatBoostPool(const Context & context, String column_description_file_name, String data_description_file_name); -}; - -} // namespace DB diff --git a/dbms/src/Storages/StorageDeltaMerge.cpp b/dbms/src/Storages/StorageDeltaMerge.cpp index 67d32c73a05..a6de4efb3ac 100644 --- a/dbms/src/Storages/StorageDeltaMerge.cpp +++ b/dbms/src/Storages/StorageDeltaMerge.cpp @@ -775,12 +775,12 @@ void StorageDeltaMerge::checkStatus(const Context & context) void StorageDeltaMerge::flushCache(const Context & context) { - flushCache(context, DM::RowKeyRange::newAll(is_common_handle, rowkey_column_size)); + flushCache(context, DM::RowKeyRange::newAll(is_common_handle, rowkey_column_size), /* try_until_succeed */ true); } -void StorageDeltaMerge::flushCache(const Context & context, const DM::RowKeyRange & range_to_flush) +bool StorageDeltaMerge::flushCache(const Context & context, const DM::RowKeyRange & range_to_flush, bool try_until_succeed) { - getAndMaybeInitStore()->flushCache(context, range_to_flush); + return getAndMaybeInitStore()->flushCache(context, range_to_flush, try_until_succeed); } void StorageDeltaMerge::mergeDelta(const Context & context) diff --git a/dbms/src/Storages/StorageDeltaMerge.h b/dbms/src/Storages/StorageDeltaMerge.h index 79ee225d237..9e4ab12ad4f 100644 --- a/dbms/src/Storages/StorageDeltaMerge.h +++ b/dbms/src/Storages/StorageDeltaMerge.h @@ -73,7 +73,7 @@ class StorageDeltaMerge void flushCache(const Context & context) override; - void flushCache(const Context & context, const DM::RowKeyRange & range_to_flush) override; + bool flushCache(const Context & context, const DM::RowKeyRange & range_to_flush, bool try_until_succeed) override; /// Merge delta into the stable layer for all segments. 
/// diff --git a/dbms/src/Storages/StorageFile.cpp b/dbms/src/Storages/StorageFile.cpp deleted file mode 100644 index 4dec4fd5ea0..00000000000 --- a/dbms/src/Storages/StorageFile.cpp +++ /dev/null @@ -1,348 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include - -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_WRITE_TO_FILE_DESCRIPTOR; - extern const int CANNOT_SEEK_THROUGH_FILE; - extern const int DATABASE_ACCESS_DENIED; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int UNKNOWN_IDENTIFIER; - extern const int INCORRECT_FILE_NAME; - extern const int FILE_DOESNT_EXIST; - extern const int EMPTY_LIST_OF_COLUMNS_PASSED; -}; - - -static std::string getTablePath(const std::string & db_dir_path, const std::string & table_name, const std::string & format_name) -{ - return db_dir_path + escapeForFileName(table_name) + "/data." + escapeForFileName(format_name); -} - -/// Both db_dir_path and table_path must be converted to absolute paths (in particular, path cannot contain '..'). -static void checkCreationIsAllowed(Context & context_global, const std::string & db_dir_path, const std::string & table_path, int table_fd) -{ - if (context_global.getApplicationType() != Context::ApplicationType::SERVER) - return; - - if (table_fd >= 0) - throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED); - else if (!startsWith(table_path, db_dir_path)) - throw Exception("Part path " + table_path + " is not inside " + db_dir_path, ErrorCodes::DATABASE_ACCESS_DENIED); - - Poco::File table_path_poco_file = Poco::File(table_path); - if (!table_path_poco_file.exists()) - throw Exception("File " + table_path + " is not exist", ErrorCodes::FILE_DOESNT_EXIST); - else if (table_path_poco_file.isDirectory()) - throw Exception("File " + table_path + " must not be a directory", ErrorCodes::INCORRECT_FILE_NAME); -} - - -StorageFile::StorageFile( - const std::string & table_path_, - int table_fd_, - const std::string & db_dir_path, - const std::string & table_name_, - const std::string & format_name_, - const ColumnsDescription & columns_, - Context & context_) - : IStorage(columns_), - table_name(table_name_), format_name(format_name_), context_global(context_), table_fd(table_fd_) -{ - if (table_fd < 0) /// Will use file - { - use_table_fd = false; - - if (!table_path_.empty()) /// Is user's file - { - Poco::Path poco_path = Poco::Path(table_path_); - if (poco_path.isRelative()) - poco_path = Poco::Path(db_dir_path, poco_path); - - path = poco_path.absolute().toString(); - checkCreationIsAllowed(context_global, db_dir_path, path, table_fd); - is_db_table = false; - } - else /// Is DB's file - { - if (db_dir_path.empty()) - throw Exception("Storage " + getName() + " requires data path", 
ErrorCodes::INCORRECT_FILE_NAME); - - path = getTablePath(db_dir_path, table_name, format_name); - is_db_table = true; - Poco::File(Poco::Path(path).parent()).createDirectories(); - } - } - else /// Will use FD - { - checkCreationIsAllowed(context_global, db_dir_path, path, table_fd); - - is_db_table = false; - use_table_fd = true; - - /// Save initial offset, it will be used for repeating SELECTs - /// If FD isn't seekable (lseek returns -1), then the second and subsequent SELECTs will fail. - table_fd_init_offset = lseek(table_fd, 0, SEEK_CUR); - } -} - - -class StorageFileBlockInputStream : public IProfilingBlockInputStream -{ -public: - StorageFileBlockInputStream(StorageFile & storage_, const Context & context, size_t max_block_size) - : storage(storage_) - { - if (storage.use_table_fd) - { - storage.rwlock.lock(); - - /// We could use common ReadBuffer and WriteBuffer in storage to leverage cache - /// and add ability to seek unseekable files, but cache sync isn't supported. - - if (storage.table_fd_was_used) /// We need seek to initial position - { - if (storage.table_fd_init_offset < 0) - throw Exception("File descriptor isn't seekable, inside " + storage.getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); - - /// ReadBuffer's seek() doesn't make sence, since cache is empty - if (lseek(storage.table_fd, storage.table_fd_init_offset, SEEK_SET) < 0) - throwFromErrno("Cannot seek file descriptor, inside " + storage.getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); - } - - storage.table_fd_was_used = true; - read_buf = std::make_unique(storage.table_fd); - } - else - { - storage.rwlock.lock_shared(); - - read_buf = std::make_unique(storage.path); - } - - reader = FormatFactory().getInput(storage.format_name, *read_buf, storage.getSampleBlock(), context, max_block_size); - } - - ~StorageFileBlockInputStream() override - { - if (storage.use_table_fd) - storage.rwlock.unlock(); - else - storage.rwlock.unlock_shared(); - } - - String getName() const override - { - return storage.getName(); - } - - Block readImpl() override - { - return reader->read(); - } - - Block getHeader() const override { return reader->getHeader(); }; - - void readPrefixImpl() override - { - reader->readPrefix(); - } - - void readSuffixImpl() override - { - reader->readSuffix(); - } - -private: - StorageFile & storage; - Block sample_block; - std::unique_ptr read_buf; - BlockInputStreamPtr reader; -}; - - -BlockInputStreams StorageFile::read( - const Names & /*column_names*/, - const SelectQueryInfo & /*query_info*/, - const Context & context, - QueryProcessingStage::Enum & /*processed_stage*/, - size_t max_block_size, - unsigned /*num_streams*/) -{ - return BlockInputStreams(1, std::make_shared(*this, context, max_block_size)); -} - - -class StorageFileBlockOutputStream : public IBlockOutputStream -{ -public: - explicit StorageFileBlockOutputStream(StorageFile & storage_) - : storage(storage_), lock(storage.rwlock) - { - if (storage.use_table_fd) - { - /** NOTE: Using real file binded to FD may be misleading: - * SELECT *; INSERT insert_data; SELECT *; last SELECT returns initil_fd_data + insert_data - * INSERT data; SELECT *; last SELECT returns only insert_data - */ - storage.table_fd_was_used = true; - write_buf = std::make_unique(storage.table_fd); - } - else - { - write_buf = std::make_unique(storage.path, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); - } - - writer = FormatFactory().getOutput(storage.format_name, *write_buf, storage.getSampleBlock(), storage.context_global); - } - - Block getHeader() 
const override { return storage.getSampleBlock(); } - - void write(const Block & block) override - { - writer->write(block); - } - - void writePrefix() override - { - writer->writePrefix(); - } - - void writeSuffix() override - { - writer->writeSuffix(); - } - - void flush() override - { - writer->flush(); - } - -private: - StorageFile & storage; - std::unique_lock lock; - std::unique_ptr write_buf; - BlockOutputStreamPtr writer; -}; - -BlockOutputStreamPtr StorageFile::write( - const ASTPtr & /*query*/, - const Settings & /*settings*/) -{ - return std::make_shared(*this); -} - - -void StorageFile::drop() -{ - /// Extra actions are not required. -} - - -void StorageFile::rename(const String & new_path_to_db, const String & /*new_database_name*/, const String & new_table_name) -{ - if (!is_db_table) - throw Exception("Can't rename table '" + table_name + "' binded to user-defined file (or FD)", ErrorCodes::DATABASE_ACCESS_DENIED); - - std::unique_lock lock(rwlock); - - std::string path_new = getTablePath(new_path_to_db, new_table_name, format_name); - Poco::File(Poco::Path(path_new).parent()).createDirectories(); - Poco::File(path).renameTo(path_new); - - path = std::move(path_new); -} - - -void registerStorageFile(StorageFactory & factory) -{ - factory.registerStorage("File", [](const StorageFactory::Arguments & args) - { - ASTs & engine_args = args.engine_args; - - if (!(engine_args.size() == 1 || engine_args.size() == 2)) - throw Exception( - "Storage File requires 1 or 2 arguments: name of used format and source.", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.local_context); - String format_name = static_cast(*engine_args[0]).value.safeGet(); - - int source_fd = -1; - String source_path; - if (engine_args.size() >= 2) - { - /// Will use FD if engine_args[1] is int literal or identifier with std* name - - if (const ASTIdentifier * identifier = typeid_cast(engine_args[1].get())) - { - if (identifier->name == "stdin") - source_fd = STDIN_FILENO; - else if (identifier->name == "stdout") - source_fd = STDOUT_FILENO; - else if (identifier->name == "stderr") - source_fd = STDERR_FILENO; - else - throw Exception("Unknown identifier '" + identifier->name + "' in second arg of File storage constructor", - ErrorCodes::UNKNOWN_IDENTIFIER); - } - else if (const ASTLiteral * literal = typeid_cast(engine_args[1].get())) - { - auto type = literal->value.getType(); - if (type == Field::Types::Int64) - source_fd = static_cast(literal->value.get()); - else if (type == Field::Types::UInt64) - source_fd = static_cast(literal->value.get()); - else if (type == Field::Types::String) - source_path = literal->value.get(); - } - } - - return StorageFile::create( - source_path, source_fd, - args.data_path, - args.table_name, format_name, args.columns, - args.context); - }); -} - -} diff --git a/dbms/src/Storages/StorageFile.h b/dbms/src/Storages/StorageFile.h deleted file mode 100644 index ca46f7f366e..00000000000 --- a/dbms/src/Storages/StorageFile.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - - -namespace DB -{ -class StorageFileBlockInputStream; -class StorageFileBlockOutputStream; - -class StorageFile : public ext::SharedPtrHelper - , public IStorage -{ -public: - std::string getName() const override - { - return "File"; - } - - std::string getTableName() const override - { - return table_name; - } - - BlockInputStreams read( - const Names & column_names, - const SelectQueryInfo & query_info, - const Context & context, - QueryProcessingStage::Enum & processed_stage, - size_t max_block_size, - unsigned num_streams) override; - - BlockOutputStreamPtr write( - const ASTPtr & query, - const Settings & settings) override; - - void drop() override; - - void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override; - - String getDataPath() const override { return path; } - -protected: - friend class StorageFileBlockInputStream; - friend class StorageFileBlockOutputStream; - - /** there are three options (ordered by priority): - - use specified file descriptor if (fd >= 0) - - use specified table_path if it isn't empty - - create own table inside data/db/table/ - */ - StorageFile( - const std::string & table_path_, - int table_fd_, - const std::string & db_dir_path, - const std::string & table_name_, - const std::string & format_name_, - const ColumnsDescription & columns_, - Context & context_); - -private: - std::string table_name; - std::string format_name; - Context & context_global; - - std::string path; - int table_fd = -1; - - bool is_db_table = true; /// Table is stored in real database, not user's file - bool use_table_fd = false; /// Use table_fd insted of path - std::atomic table_fd_was_used{false}; /// To detect repeating reads from stdin - off_t table_fd_init_offset = -1; /// Initial position of fd, used for repeating reads - - mutable std::shared_mutex rwlock; - - Poco::Logger * log = &Poco::Logger::get("StorageFile"); -}; - -} // namespace DB diff --git a/dbms/src/Storages/Transaction/Collator.cpp b/dbms/src/Storages/Transaction/Collator.cpp index a9b4d0784be..1b0221a6829 100644 --- a/dbms/src/Storages/Transaction/Collator.cpp +++ b/dbms/src/Storages/Transaction/Collator.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -29,17 +30,10 @@ TiDBCollators dummy_collators; std::vector dummy_sort_key_contaners; std::string dummy_sort_key_contaner; -std::string_view rtrim(const char * s, size_t length) +ALWAYS_INLINE std::string_view rtrim(const char * s, size_t length) { auto v = std::string_view(s, length); - size_t end = v.find_last_not_of(' '); - return end == std::string_view::npos ? 
"" : v.substr(0, end + 1); -} - -template -int signum(T val) -{ - return (0 < val) - (val < 0); + return DB::RightTrim(v); } using Rune = int32_t; @@ -183,26 +177,26 @@ class Pattern : public ITiDBCollator::IPattern }; template -class BinCollator : public ITiDBCollator +class BinCollator final : public ITiDBCollator { public: explicit BinCollator(int32_t id) : ITiDBCollator(id) {} + int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override { if constexpr (padding) - return signum(rtrim(s1, length1).compare(rtrim(s2, length2))); + return DB::RtrimStrCompare({s1, length1}, {s2, length2}); else - return signum(std::string_view(s1, length1).compare(std::string_view(s2, length2))); + return DB::RawStrCompare({s1, length1}, {s2, length2}); } StringRef sortKey(const char * s, size_t length, std::string &) const override { if constexpr (padding) { - auto v = rtrim(s, length); - return StringRef(v.data(), v.length()); + return StringRef(rtrim(s, length)); } else { @@ -249,7 +243,7 @@ using WeightType = uint16_t; extern const std::array weight_lut; } // namespace GeneralCI -class GeneralCICollator : public ITiDBCollator +class GeneralCICollator final : public ITiDBCollator { public: explicit GeneralCICollator(int32_t id) @@ -270,7 +264,7 @@ class GeneralCICollator : public ITiDBCollator auto sk2 = weight(c2); auto cmp = sk1 - sk2; if (cmp != 0) - return signum(cmp); + return DB::signum(cmp); } return (offset1 < v1.length()) - (offset2 < v2.length()); @@ -365,7 +359,7 @@ const std::array weight_lut_long = { } // namespace UnicodeCI -class UnicodeCICollator : public ITiDBCollator +class UnicodeCICollator final : public ITiDBCollator { public: explicit UnicodeCICollator(int32_t id) @@ -420,7 +414,7 @@ class UnicodeCICollator : public ITiDBCollator } else { - return signum(static_cast(s1_first & 0xFFFF) - static_cast(s2_first & 0xFFFF)); + return DB::signum(static_cast(s1_first & 0xFFFF) - static_cast(s2_first & 0xFFFF)); } } } @@ -593,6 +587,8 @@ class UnicodeCICollator : public ITiDBCollator friend class Pattern; }; +using UTF8MB4_BIN_TYPE = BinCollator; + TiDBCollatorPtr ITiDBCollator::getCollator(int32_t id) { switch (id) @@ -607,10 +603,10 @@ TiDBCollatorPtr ITiDBCollator::getCollator(int32_t id) static const auto latin1_collator = BinCollator(LATIN1_BIN); return &latin1_collator; case ITiDBCollator::UTF8MB4_BIN: - static const auto utf8mb4_collator = BinCollator(UTF8MB4_BIN); + static const auto utf8mb4_collator = UTF8MB4_BIN_TYPE(UTF8MB4_BIN); return &utf8mb4_collator; case ITiDBCollator::UTF8_BIN: - static const auto utf8_collator = BinCollator(UTF8_BIN); + static const auto utf8_collator = UTF8MB4_BIN_TYPE(UTF8_BIN); return &utf8_collator; case ITiDBCollator::UTF8_GENERAL_CI: static const auto utf8_general_ci_collator = GeneralCICollator(UTF8_GENERAL_CI); diff --git a/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h b/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h index c636d9e60ab..b0cacefe6f4 100644 --- a/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h +++ b/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h @@ -77,10 +77,12 @@ struct DecodingStorageSchemaSnapshot , decoding_schema_version{decoding_schema_version_} { std::unordered_map column_lut; + std::unordered_map column_name_id_map; for (size_t i = 0; i < table_info_.columns.size(); i++) { const auto & ci = table_info_.columns[i]; column_lut.emplace(ci.id, i); + column_name_id_map.emplace(ci.name, ci.id); } for (size_t i = 0; i < 
column_defines->size(); i++)
        {
@@ -88,7 +90,7 @@ struct DecodingStorageSchemaSnapshot
            sorted_column_id_with_pos.insert({cd.id, i});
            if (cd.id != TiDBPkColumnID && cd.id != VersionColumnID && cd.id != DelMarkColumnID)
            {
-                auto & columns = table_info_.columns;
+                const auto & columns = table_info_.columns;
                column_infos.push_back(columns[column_lut.at(cd.id)]);
            }
            else
@@ -100,10 +102,14 @@
        // create pk related metadata if needed
        if (is_common_handle)
        {
-            const auto & primary_index_info = table_info_.getPrimaryIndexInfo();
-            for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++)
+            /// We will not update the IndexInfo except for the Rename DDL action.
+            /// When an add column / drop column action happens, the offset of each column may change.
+            /// Thus, we should not use the offset to get the column we want,
+            /// but compare the column names to get the column id.
+            const auto & primary_index_cols = table_info_.getPrimaryIndexInfo().idx_cols;
+            for (const auto & col : primary_index_cols)
            {
-                auto pk_column_id = table_info_.columns[primary_index_info.idx_cols[i].offset].id;
+                auto pk_column_id = column_name_id_map[col.name];
                pk_column_ids.emplace_back(pk_column_id);
                pk_pos_map.emplace(pk_column_id, reinterpret_cast(std::numeric_limits::max()));
            }
@@ -125,11 +131,11 @@
    {
        auto pk_pos_iter = pk_pos_map.begin();
        size_t column_pos_in_block = 0;
-        for (auto iter = sorted_column_id_with_pos.begin(); iter != sorted_column_id_with_pos.end(); iter++)
+        for (auto & column_id_with_pos : sorted_column_id_with_pos)
        {
            if (pk_pos_iter == pk_pos_map.end())
                break;
-            if (pk_pos_iter->first == iter->first)
+            if (pk_pos_iter->first == column_id_with_pos.first)
            {
                pk_pos_iter->second = column_pos_in_block;
                pk_pos_iter++;
diff --git a/dbms/src/Storages/Transaction/KVStore.cpp b/dbms/src/Storages/Transaction/KVStore.cpp
index 318a04c6ed9..fb31e4476bb 100644
--- a/dbms/src/Storages/Transaction/KVStore.cpp
+++ b/dbms/src/Storages/Transaction/KVStore.cpp
@@ -129,7 +129,7 @@ void KVStore::traverseRegions(std::function & callback
        callback(region.first, region.second);
}

-void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log)
+bool KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log, bool try_until_succeed)
{
    auto table_id = region.getMappedTableID();
    auto storage = tmt.getStorages().get(table_id);
@@ -139,7 +139,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
            "tryFlushRegionCacheInStorage can not get table for region {} with table id {}, ignored",
            region.toString(),
            table_id);
-        return;
+        return true;
    }

    try
@@ -151,7 +151,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
            region.getRange()->getMappedTableID(),
            storage->isCommonHandle(),
            storage->getRowKeyColumnSize());
-        storage->flushCache(tmt.getContext(), rowkey_range);
+        return storage->flushCache(tmt.getContext(), rowkey_range, try_until_succeed);
    }
    catch (DB::Exception & e)
    {
@@ -159,6 +159,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
        if (e.code() != ErrorCodes::TABLE_IS_DROPPED)
            throw;
    }
+    return true;
}

void KVStore::tryPersist(RegionID region_id)
@@ -326,6 +327,64 @@ void KVStore::persistRegion(const Region & region, const RegionTaskLock & region
    LOG_FMT_DEBUG(log, "Persist {} done", region.toString(false));
}

+bool KVStore::needFlushRegionData(UInt64 region_id, TMTContext & tmt)
+{
+    auto region_task_lock = region_manager.genRegionTaskLock(region_id);
+    const RegionPtr curr_region_ptr = getRegion(region_id);
+    return canFlushRegionDataImpl(curr_region_ptr, false, false, tmt, region_task_lock);
+}
+
+bool KVStore::tryFlushRegionData(UInt64 region_id, bool try_until_succeed, TMTContext & tmt)
+{
+    auto region_task_lock = region_manager.genRegionTaskLock(region_id);
+    const RegionPtr curr_region_ptr = getRegion(region_id);
+    return canFlushRegionDataImpl(curr_region_ptr, true, try_until_succeed, tmt, region_task_lock);
+}
+
+bool KVStore::canFlushRegionDataImpl(const RegionPtr & curr_region_ptr, UInt8 flush_if_possible, bool try_until_succeed, TMTContext & tmt, const RegionTaskLock & region_task_lock)
+{
+    if (curr_region_ptr == nullptr)
+    {
+        throw Exception("region not found when trying to flush", ErrorCodes::LOGICAL_ERROR);
+    }
+    auto & curr_region = *curr_region_ptr;
+
+    auto [rows, size_bytes] = curr_region.getApproxMemCacheInfo();
+
+    LOG_FMT_DEBUG(log, "{} approx mem cache info: rows {}, bytes {}", curr_region.toString(false), rows, size_bytes);
+
+    bool can_flush = false;
+    if (rows >= region_compact_log_min_rows.load(std::memory_order_relaxed)
+        || size_bytes >= region_compact_log_min_bytes.load(std::memory_order_relaxed))
+    {
+        // If rows or bytes exceed the threshold, flush the cache and persist the data in memory.
+        can_flush = true;
+    }
+    else
+    {
+        // If there is little data in memory, wait until the time interval reaches the threshold.
+        // Use a random period so that many regions will not be persisted at the same time.
+        auto compact_log_period = std::rand() % region_compact_log_period.load(std::memory_order_relaxed); // NOLINT
+        can_flush = !(curr_region.lastCompactLogTime() + Seconds{compact_log_period} > Clock::now());
+    }
+    if (can_flush && flush_if_possible)
+    {
+        LOG_FMT_DEBUG(log, "{} flush region due to can_flush_data", curr_region.toString(false));
+        if (tryFlushRegionCacheInStorage(tmt, curr_region, log, try_until_succeed))
+        {
+            persistRegion(curr_region, region_task_lock, "compact raft log");
+            curr_region.markCompactLog();
+            curr_region.cleanApproxMemCacheInfo();
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+    return can_flush;
+}
+
EngineStoreApplyRes KVStore::handleUselessAdminRaftCmd(
    raft_cmdpb::AdminCmdType cmd_type,
    UInt64 curr_region_id,
@@ -359,32 +418,12 @@
        }
        else
        {
-            auto [rows, size_bytes] = curr_region.getApproxMemCacheInfo();
-
-            LOG_FMT_DEBUG(log, "{} approx mem cache info: rows {}, bytes {}", curr_region.toString(false), rows, size_bytes);
-
-            if (rows >= region_compact_log_min_rows.load(std::memory_order_relaxed)
-                || size_bytes >= region_compact_log_min_bytes.load(std::memory_order_relaxed))
-            {
-                // if rows or bytes more than threshold, flush cache and perist mem data.
-                return true;
-            }
-            else
-            {
-                // if thhere is little data in mem, wait until time interval reached threshold.
-                // use random period so that lots of regions will not be persisted at same time.
-                auto compact_log_period = std::rand() % region_compact_log_period.load(std::memory_order_relaxed); // NOLINT
-                return !(curr_region.lastCompactLogTime() + Seconds{compact_log_period} > Clock::now());
-            }
+            return canFlushRegionDataImpl(curr_region_ptr, true, /* try_until_succeed */ false, tmt, region_task_lock);
        }
    };

    if (check_sync_log())
    {
-        tryFlushRegionCacheInStorage(tmt, curr_region, log);
-        persistRegion(curr_region, region_task_lock, "compact raft log");
-        curr_region.markCompactLog();
-        curr_region.cleanApproxMemCacheInfo();
        return EngineStoreApplyRes::Persist;
    }
    return EngineStoreApplyRes::None;
diff --git a/dbms/src/Storages/Transaction/KVStore.h b/dbms/src/Storages/Transaction/KVStore.h
index bb45e65d18b..b58083557a1 100644
--- a/dbms/src/Storages/Transaction/KVStore.h
+++ b/dbms/src/Storages/Transaction/KVStore.h
@@ -91,7 +91,7 @@ class KVStore final : private boost::noncopyable
    void tryPersist(RegionID region_id);

-    static void tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log);
+    static bool tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log, bool try_until_succeed = true);

    size_t regionSize() const;
    EngineStoreApplyRes handleAdminRaftCmd(raft_cmdpb::AdminRequest && request,
@@ -108,6 +108,9 @@ class KVStore final : private boost::noncopyable
        TMTContext & tmt);
    EngineStoreApplyRes handleWriteRaftCmd(const WriteCmdsView & cmds, UInt64 region_id, UInt64 index, UInt64 term, TMTContext & tmt);

+    bool needFlushRegionData(UInt64 region_id, TMTContext & tmt);
+    bool tryFlushRegionData(UInt64 region_id, bool try_until_succeed, TMTContext & tmt);
+
    void handleApplySnapshot(metapb::Region && region, uint64_t peer_id, const SSTViewVec, uint64_t index, uint64_t term, TMTContext & tmt);

    std::vector /* */ preHandleSnapshotToFiles(
@@ -219,6 +222,11 @@ class KVStore final : private boost::noncopyable
        UInt64 term,
        TMTContext & tmt);

+    /// Note that if flush_if_possible is set to false, we only check whether a flush is allowed by the rows/bytes/interval thresholds;
+    /// we do not check whether the flush would eventually succeed.
+    /// In other words, `canFlushRegionDataImpl(flush_if_possible=true)` can still return false
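+    /// (e.g. when try_until_succeed is false and the underlying storage fails to flush the region cache, nothing is persisted and the raft log is not compacted).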
+    bool canFlushRegionDataImpl(const RegionPtr & curr_region_ptr, UInt8 flush_if_possible, bool try_until_succeed, TMTContext & tmt, const RegionTaskLock & region_task_lock);
+
    void persistRegion(const Region & region, const RegionTaskLock & region_task_lock, const char * caller);
    void releaseReadIndexWorkers();
    void handleDestroy(UInt64 region_id, TMTContext & tmt, const KVStoreTaskLock &);
diff --git a/dbms/src/Storages/Transaction/PDTiKVClient.cpp b/dbms/src/Storages/Transaction/PDTiKVClient.cpp
index 5a4b751fd9c..a06f1a3ae64 100644
--- a/dbms/src/Storages/Transaction/PDTiKVClient.cpp
+++ b/dbms/src/Storages/Transaction/PDTiKVClient.cpp
@@ -22,7 +22,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}

-Timestamp PDClientHelper::cached_gc_safe_point = 0;
-std::chrono::time_point PDClientHelper::safe_point_last_update_time;
+std::atomic PDClientHelper::cached_gc_safe_point = 0;
+std::atomic> PDClientHelper::safe_point_last_update_time;

} // namespace DB
diff --git a/dbms/src/Storages/Transaction/PDTiKVClient.h b/dbms/src/Storages/Transaction/PDTiKVClient.h
index 4986c28f4ac..e5801cc7fae 100644
--- a/dbms/src/Storages/Transaction/PDTiKVClient.h
+++ b/dbms/src/Storages/Transaction/PDTiKVClient.h
@@ -29,6 +29,8 @@
 #include
 #include

+#include
+
// We define a shared ptr here, because TMTContext / SchemaSyncer / IndexReader all need to
// `share` the resource of cluster.
using KVClusterPtr = std::shared_ptr;
@@ -49,7 +51,7 @@ struct PDClientHelper
    {
        // In case we cost too much to update safe point from PD.
        std::chrono::time_point now = std::chrono::system_clock::now();
-        const auto duration = std::chrono::duration_cast(now - safe_point_last_update_time);
+        const auto duration = std::chrono::duration_cast(now - safe_point_last_update_time.load());
        const auto min_interval = std::max(Int64(1), safe_point_update_interval_seconds); // at least one second
        if (duration.count() < min_interval)
            return cached_gc_safe_point;
@@ -73,8 +75,8 @@ struct PDClientHelper
    }

private:
-    static Timestamp cached_gc_safe_point;
-    static std::chrono::time_point safe_point_last_update_time;
+    static std::atomic cached_gc_safe_point;
+    static std::atomic> safe_point_last_update_time;
};
diff --git a/dbms/src/Storages/Transaction/PartitionStreams.cpp b/dbms/src/Storages/Transaction/PartitionStreams.cpp
index 4b2ca6c07a8..cf151c4270d 100644
--- a/dbms/src/Storages/Transaction/PartitionStreams.cpp
+++ b/dbms/src/Storages/Transaction/PartitionStreams.cpp
@@ -13,6 +13,7 @@
// limitations under the License.

 #include
+#include
 #include
 #include
 #include
@@ -39,6 +40,8 @@ namespace FailPoints
extern const char pause_before_apply_raft_cmd[];
extern const char pause_before_apply_raft_snapshot[];
extern const char force_set_safepoint_when_decode_block[];
+extern const char unblock_query_init_after_write[];
+extern const char pause_query_init[];
} // namespace FailPoints

namespace ErrorCodes
@@ -150,6 +153,7 @@ static void writeRegionDataToStorage(
    default:
        throw Exception("Unknown StorageEngine: " + toString(static_cast(storage->engineType())), ErrorCodes::LOGICAL_ERROR);
    }
+
    write_part_cost = watch.elapsedMilliseconds();
    GET_METRIC(tiflash_raft_write_data_to_storage_duration_seconds, type_write).Observe(write_part_cost / 1000.0);
    if (need_decode)
@@ -164,10 +168,20 @@
    /// decoding data. Check the test case for more details.
    FAIL_POINT_PAUSE(FailPoints::pause_before_apply_raft_cmd);

+    /// Disable pause_query_init when the write action finishes, so that the query action can continue.
+    /// For the usage of unblock_query_init_after_write and pause_query_init, refer to InterpreterSelectQuery::init.
+    SCOPE_EXIT({
+        fiu_do_on(FailPoints::unblock_query_init_after_write, {
+            FailPointHelper::disableFailPoint(FailPoints::pause_query_init);
+        });
+    });
+
    /// Try read then write once.
    {
        if (atomic_read_write(false))
+        {
            return;
+        }
    }

    /// If first try failed, sync schema and force read then write.
    {
@@ -176,10 +190,12 @@
        tmt.getSchemaSyncer()->syncSchemas(context);

        if (!atomic_read_write(true))
+        {
            // Failure won't be tolerated this time.
            // TODO: Enrich exception message.
            throw Exception("Write region " + std::to_string(region->id()) + " to table " + std::to_string(table_id) + " failed",
                            ErrorCodes::LOGICAL_ERROR);
+        }
    }
}
diff --git a/dbms/src/Storages/Transaction/ProxyFFI.cpp b/dbms/src/Storages/Transaction/ProxyFFI.cpp
index 8a40ca9b15e..d4ba50d5714 100644
--- a/dbms/src/Storages/Transaction/ProxyFFI.cpp
+++ b/dbms/src/Storages/Transaction/ProxyFFI.cpp
@@ -128,6 +128,34 @@ EngineStoreApplyRes HandleAdminRaftCmd(
    }
}

+uint8_t NeedFlushData(EngineStoreServerWrap * server, uint64_t region_id)
+{
+    try
+    {
+        auto & kvstore = server->tmt->getKVStore();
+        return kvstore->needFlushRegionData(region_id, *server->tmt);
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__);
+        exit(-1);
+    }
+}
+
+uint8_t TryFlushData(EngineStoreServerWrap * server, uint64_t region_id, uint8_t until_succeed)
+{
+    try
+    {
+        auto & kvstore = server->tmt->getKVStore();
+        return kvstore->tryFlushRegionData(region_id, until_succeed, *server->tmt);
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__);
+        exit(-1);
+    }
+}
+
static_assert(sizeof(RaftStoreProxyFFIHelper) == sizeof(TiFlashRaftProxyHelper));
static_assert(alignof(RaftStoreProxyFFIHelper) == alignof(TiFlashRaftProxyHelper));
diff --git a/dbms/src/Storages/Transaction/ProxyFFI.h b/dbms/src/Storages/Transaction/ProxyFFI.h
index e1c01599275..aafe4b375eb 100644
--- a/dbms/src/Storages/Transaction/ProxyFFI.h
+++ b/dbms/src/Storages/Transaction/ProxyFFI.h
@@ -125,6 +125,8 @@ EngineStoreApplyRes HandleAdminRaftCmd(
EngineStoreApplyRes HandleWriteRaftCmd(const EngineStoreServerWrap * server, WriteCmdsView cmds, RaftCmdHeader header);
+uint8_t NeedFlushData(EngineStoreServerWrap * server, uint64_t region_id);
+uint8_t TryFlushData(EngineStoreServerWrap * server, uint64_t region_id, uint8_t until_succeed);
void AtomicUpdateProxy(EngineStoreServerWrap * server, RaftStoreProxyFFIHelper * proxy);
void HandleDestroy(EngineStoreServerWrap * server, uint64_t region_id);
EngineStoreApplyRes HandleIngestSST(EngineStoreServerWrap * server, SSTViewVec snaps, RaftCmdHeader header);
@@ -158,6 +160,8 @@ inline EngineStoreServerHelper GetEngineStoreServerHelper(
        .fn_gen_cpp_string = GenCppRawString,
        .fn_handle_write_raft_cmd = HandleWriteRaftCmd,
        .fn_handle_admin_raft_cmd = HandleAdminRaftCmd,
+        .fn_need_flush_data = NeedFlushData,
+        .fn_try_flush_data = TryFlushData,
        .fn_atomic_update_proxy = AtomicUpdateProxy,
        .fn_handle_destroy = HandleDestroy,
        .fn_handle_ingest_sst = HandleIngestSST,
diff --git a/dbms/src/Storages/Transaction/ProxyFFIStatusService.cpp b/dbms/src/Storages/Transaction/ProxyFFIStatusService.cpp
index dafacd8947d..792f149f588 100644
--- a/dbms/src/Storages/Transaction/ProxyFFIStatusService.cpp
+++ b/dbms/src/Storages/Transaction/ProxyFFIStatusService.cpp
@@ -22,26 +22,6 @@
namespace DB
{
-HttpRequestRes HandleHttpRequestTestShow(
-    EngineStoreServerWrap *,
-    std::string_view path,
-    const std::string & api_name,
-    std::string_view query,
-    std::string_view body)
-{
-    auto * res = RawCppString::New(fmt::format(
-        "api_name: {}\npath: {}\nquery: {}\nbody: {}",
-        api_name,
-        path,
-        query,
-        body));
-    return HttpRequestRes{
-        .status = HttpRequestStatus::Ok,
-        .res = CppStrWithView{
-            .inner = GenRawCppPtr(res, RawCppPtrTypeImpl::String),
-            .view = BaseBuffView{res->data(), res->size()}}};
-}
-
HttpRequestRes HandleHttpRequestSyncStatus(
    EngineStoreServerWrap * server,
    std::string_view path,
@@ -112,8 +92,7 @@ using HANDLE_HTTP_URI_METHOD = HttpRequestRes (*)(EngineStoreServerWrap *, std::
static const std::map AVAILABLE_HTTP_URI = {
    {"/tiflash/sync-status/", HandleHttpRequestSyncStatus},
-    {"/tiflash/store-status", HandleHttpRequestStoreStatus},
-    {"/tiflash/test-show", HandleHttpRequestTestShow}};
+    {"/tiflash/store-status", HandleHttpRequestStoreStatus}};

uint8_t CheckHttpUriAvailable(BaseBuffView path_)
{
diff --git a/dbms/src/Storages/Transaction/ReadIndexWorker.cpp b/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
index 3223c815989..7de79dd5c6d 100644
--- a/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
+++ b/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
@@ -880,7 +880,7 @@ BatchReadIndexRes ReadIndexWorkerManager::batchReadIndex(
        }
    }
    { // if meet timeout, which means part of regions can not get response from leader, try to poll rest tasks
-        TEST_LOG_FMT("rest {}, poll rest tasks onece", tasks.size());
+        TEST_LOG_FMT("rest {}, poll rest tasks once", tasks.size());

        while (!tasks.empty())
        {
diff --git a/dbms/src/Storages/Transaction/RegionBlockReader.cpp b/dbms/src/Storages/Transaction/RegionBlockReader.cpp
index af351f4a6b0..2ec690c467b 100644
--- a/dbms/src/Storages/Transaction/RegionBlockReader.cpp
+++ b/dbms/src/Storages/Transaction/RegionBlockReader.cpp
@@ -186,6 +186,7 @@
        }
        else
        {
+            // For a common handle, sometimes we need to decode the value from the encoded key instead of the encoded value
            auto * raw_extra_column = const_cast((block.getByPosition(extra_handle_column_pos)).column.get());
            raw_extra_column->insertData(pk->data(), pk->size());
            /// decode key and insert pk columns if needed
@@ -207,6 +208,8 @@
        }
        index++;
    }
+    block.checkNumberOfRows();
+
    return true;
}
diff --git a/dbms/src/Storages/Transaction/RegionBlockReader.h b/dbms/src/Storages/Transaction/RegionBlockReader.h
index ec633e805c0..004d9f40447 100644
--- a/dbms/src/Storages/Transaction/RegionBlockReader.h
+++ b/dbms/src/Storages/Transaction/RegionBlockReader.h
@@ -41,7 +41,7 @@ class Block;
class RegionBlockReader : private boost::noncopyable
{
public:
-    RegionBlockReader(DecodingStorageSchemaSnapshotConstPtr schema_snapshot_);
+    explicit RegionBlockReader(DecodingStorageSchemaSnapshotConstPtr schema_snapshot_);

    /// Read `data_list` as a block.
///
diff --git a/dbms/src/Storages/Transaction/RegionTable.cpp b/dbms/src/Storages/Transaction/RegionTable.cpp
index c855d5b3226..5ae36a4bd64 100644
--- a/dbms/src/Storages/Transaction/RegionTable.cpp
+++ b/dbms/src/Storages/Transaction/RegionTable.cpp
@@ -230,7 +230,7 @@ void removeObsoleteDataInStorage(
        auto rowkey_range = DM::RowKeyRange::fromRegionRange(handle_range, table_id, table_id, storage->isCommonHandle(), storage->getRowKeyColumnSize());
        dm_storage->deleteRange(rowkey_range, context->getSettingsRef());
-        dm_storage->flushCache(*context, rowkey_range); // flush to disk
+        dm_storage->flushCache(*context, rowkey_range, /*try_until_succeed*/ true); // flush to disk
    }
    catch (DB::Exception & e)
    {
diff --git a/dbms/src/Storages/Transaction/RowCodec.cpp b/dbms/src/Storages/Transaction/RowCodec.cpp
index 427544a0467..ea7f6b7c2da 100644
--- a/dbms/src/Storages/Transaction/RowCodec.cpp
+++ b/dbms/src/Storages/Transaction/RowCodec.cpp
@@ -314,7 +314,7 @@ bool appendRowV2ToBlock(
    ColumnID pk_handle_id,
    bool force_decode)
{
-    UInt8 row_flag = readLittleEndian(&raw_value[1]);
+    auto row_flag = readLittleEndian(&raw_value[1]);
    bool is_big = row_flag & RowV2::BigRowMask;
    return is_big ? appendRowV2ToBlockImpl(raw_value, column_ids_iter, column_ids_iter_end, block, block_column_pos, column_infos, pk_handle_id, force_decode)
                  : appendRowV2ToBlockImpl(raw_value, column_ids_iter, column_ids_iter_end, block, block_column_pos, column_infos, pk_handle_id, force_decode);
@@ -360,9 +360,10 @@ bool appendRowV2ToBlockImpl(
    decodeUInts::ColumnIDType>(cursor, raw_value, num_null_columns, null_column_ids);
    decodeUInts::ValueOffsetType>(cursor, raw_value, num_not_null_columns, value_offsets);
    size_t values_start_pos = cursor;
-    size_t id_not_null = 0, id_null = 0;
+    size_t idx_not_null = 0;
+    size_t idx_null = 0;
    // Merge ordered not null/null columns to keep order.
-    while (id_not_null < not_null_column_ids.size() || id_null < null_column_ids.size())
+    while (idx_not_null < not_null_column_ids.size() || idx_null < null_column_ids.size())
    {
        if (column_ids_iter == column_ids_iter_end)
        {
@@ -371,24 +372,31 @@
        }

        bool is_null;
-        if (id_not_null < not_null_column_ids.size() && id_null < null_column_ids.size())
-            is_null = not_null_column_ids[id_not_null] > null_column_ids[id_null];
+        if (idx_not_null < not_null_column_ids.size() && idx_null < null_column_ids.size())
+            is_null = not_null_column_ids[idx_not_null] > null_column_ids[idx_null];
        else
-            is_null = id_null < null_column_ids.size();
+            is_null = idx_null < null_column_ids.size();

-        auto next_datum_column_id = is_null ? null_column_ids[id_null] : not_null_column_ids[id_not_null];
+        auto next_datum_column_id = is_null ? null_column_ids[idx_null] : not_null_column_ids[idx_not_null];
        if (column_ids_iter->first > next_datum_column_id)
        {
-            // extra column
+            // The next column id to read is bigger than the column id of the next datum
+            // in the encoded row. It means this datum belongs to an extra column; this may
+            // happen when reading after dropping a column.
            if (!force_decode)
                return false;
+            // Ignore the extra column and continue to parse the remaining datums
            if (is_null)
-                id_null++;
+                idx_null++;
            else
-                id_not_null++;
+                idx_not_null++;
        }
        else if (column_ids_iter->first < next_datum_column_id)
        {
+            // The next column id to read is less than the column id of the next datum
+            // in the encoded row. It means the datum of this column is missing; this may
+            // happen when reading after adding a column.
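+            // (e.g. a row written before an `ALTER TABLE ... ADD COLUMN` carries no datum for the new column).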
+            // Fill with the default value and continue to read data for the next column id.
            const auto & column_info = column_infos[column_ids_iter->second];
            if (!addDefaultValueToColumnIfPossible(column_info, block, block_column_pos, force_decode))
                return false;
@@ -397,7 +405,7 @@
        }
        else
        {
-            // if pk_handle_id is a valid column id, then it means the table's pk_is_handle is true
+            // If pk_handle_id is a valid column id, then it means the table's pk_is_handle is true,
            // we can just ignore the pk value encoded in value part
            if (unlikely(column_ids_iter->first == pk_handle_id))
            {
@@ -405,15 +413,16 @@
                block_column_pos++;
                if (is_null)
                {
-                    id_null++;
+                    idx_null++;
                }
                else
                {
-                    id_not_null++;
+                    idx_not_null++;
                }
                continue;
            }
+
+            // Parse the datum.
            auto * raw_column = const_cast((block.getByPosition(block_column_pos)).column.get());
            const auto & column_info = column_infos[column_ids_iter->second];
            if (is_null)
@@ -432,15 +441,15 @@
                }
                // ColumnNullable::insertDefault just insert a null value
                raw_column->insertDefault();
-                id_null++;
+                idx_null++;
            }
            else
            {
-                size_t start = id_not_null ? value_offsets[id_not_null - 1] : 0;
-                size_t length = value_offsets[id_not_null] - start;
+                size_t start = idx_not_null ? value_offsets[idx_not_null - 1] : 0;
+                size_t length = value_offsets[idx_not_null] - start;
                if (!raw_column->decodeTiDBRowV2Datum(values_start_pos + start, raw_value, length, force_decode))
                    return false;
-                id_not_null++;
+                idx_not_null++;
            }
            column_ids_iter++;
            block_column_pos++;
diff --git a/dbms/src/Storages/Transaction/TiDB.cpp b/dbms/src/Storages/Transaction/TiDB.cpp
index 15bf2a3fb58..6d07c47f235 100644
--- a/dbms/src/Storages/Transaction/TiDB.cpp
+++ b/dbms/src/Storages/Transaction/TiDB.cpp
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include

 #include
@@ -631,8 +632,8 @@ catch (const Poco::Exception & e)
///////////////////////
IndexColumnInfo::IndexColumnInfo(Poco::JSON::Object::Ptr json)
-    : offset(0)
-    , length(0)
+    : length(0)
+    , offset(0)
{
    deserialize(json);
}
@@ -772,6 +773,37 @@ catch (const Poco::Exception & e)
{
        DB::Exception(e));
}

+String TiFlashModeToString(TiFlashMode tiflash_mode)
+{
+    switch (tiflash_mode)
+    {
+    case TiFlashMode::Normal:
+        return "";
+    case TiFlashMode::Fast:
+        return "fast";
+    default:
+        LOG_FMT_WARNING(&Poco::Logger::get("TiDB"), "TiFlashModeToString with invalid tiflash mode {}", tiflash_mode);
+        return "";
+    }
+}
+
+TiFlashMode parseTiFlashMode(String mode_str)
+{
+    if (mode_str.empty())
+    {
+        return TiFlashMode::Normal;
+    }
+    else if (mode_str == "fast")
+    {
+        return TiFlashMode::Fast;
+    }
+    else
+    {
+        throw DB::Exception(
+            std::string(__PRETTY_FUNCTION__)
+            + " ParseTiFlashMode failed: mode " + mode_str + " is invalid, please set the mode to fast/normal");
+    }
+}
///////////////////////
////// TableInfo //////
///////////////////////
@@ -840,6 +872,8 @@ try
    json->set("tiflash_replica", replica_info.getJSONObject());

+    json->set("tiflash_mode", std::string(TiFlashModeToString(tiflash_mode)));
+
    json->stringify(buf);

    return buf.str();
@@ -926,6 +960,14 @@ try
            replica_info.deserialize(replica_obj);
        }
    }
+    if (obj->has("tiflash_mode"))
+    {
+        auto mode = obj->getValue("tiflash_mode");
+        if (!mode.empty())
+        {
+            tiflash_mode = parseTiFlashMode(mode);
+        }
+    }
    if (is_common_handle && index_infos.size() != 1)
    {
        throw DB::Exception(
diff --git a/dbms/src/Storages/Transaction/TiDB.h b/dbms/src/Storages/Transaction/TiDB.h
index f67bfb332c7..a9d46b60c13 100644
--- a/dbms/src/Storages/Transaction/TiDB.h
+++ b/dbms/src/Storages/Transaction/TiDB.h
@@ -179,7 +179,6 @@ struct ColumnInfo
    ColumnID id = -1;
    String name;
-    Int32 offset = -1;
    Poco::Dynamic::Var origin_default_value;
    Poco::Dynamic::Var default_value;
    Poco::Dynamic::Var default_bit_value;
@@ -212,6 +211,12 @@ struct ColumnInfo
    static Int64 getTimeValue(const String &);
    static Int64 getYearValue(const String &);
    static UInt64 getBitValue(const String &);
+
+private:
+    /// Please be very careful when you have to use offset;
+    /// we never update it when a DDL action changes the columns,
+    /// so it may not exactly correspond to the current order of columns.
+    Int32 offset = -1;
};

enum PartitionType
@@ -298,8 +303,13 @@ struct IndexColumnInfo
    void deserialize(Poco::JSON::Object::Ptr json);

    String name;
-    Int32 offset;
    Int32 length;
+
+private:
+    /// Please be very careful when you have to use offset;
+    /// we never update it when a DDL action changes the columns,
+    /// so it may not exactly correspond to the current order of columns.
+    Int32 offset;
};
struct IndexInfo
{
@@ -323,6 +333,12 @@
    bool is_global;
};

+enum class TiFlashMode
+{
+    Normal,
+    Fast,
+};
+
struct TableInfo
{
    TableInfo() = default;
@@ -372,6 +388,8 @@
    // The TiFlash replica info persisted by TiDB
    TiFlashReplicaInfo replica_info;

+    TiFlashMode tiflash_mode = TiFlashMode::Normal;
+
    ::TiDB::StorageEngine engine_type = ::TiDB::StorageEngine::UNSPECIFIED;

    ColumnID getColumnID(const String & name) const;
@@ -385,7 +403,12 @@
    bool isLogicalPartitionTable() const { return is_partition_table && belonging_table_id == DB::InvalidTableID && partition.enable; }

-    /// should not be called if is_common_handle = false
+    /// should not be called if is_common_handle = false.
+    /// When using IndexInfo, avoid relying on the offset info:
+    /// the offset value may be wrong in some cases, because we do not update
+    /// IndexInfo except for the RENAME DDL action, while DDL actions like
+    /// add column / drop column may change the offset of columns.
+    /// Thus, be very careful when you must use the offset information!
     const IndexInfo & getPrimaryIndexInfo() const { return index_infos[0]; }
     IndexInfo & getPrimaryIndexInfo() { return index_infos[0]; }
@@ -398,4 +421,7 @@
 String genJsonNull();
 tipb::FieldType columnInfoToFieldType(const ColumnInfo & ci);
 ColumnInfo fieldTypeToColumnInfo(const tipb::FieldType & field_type);
+String TiFlashModeToString(TiFlashMode tiflash_mode);
+TiFlashMode parseTiFlashMode(String mode_str);
+
 } // namespace TiDB
diff --git a/dbms/src/Storages/Transaction/TiKVRecordFormat.h b/dbms/src/Storages/Transaction/TiKVRecordFormat.h
index 4a25b6d9292..10a7f7220e9 100644
--- a/dbms/src/Storages/Transaction/TiKVRecordFormat.h
+++ b/dbms/src/Storages/Transaction/TiKVRecordFormat.h
@@ -30,7 +30,6 @@
 namespace DB
 {
-
 namespace ErrorCodes
 {
 extern const int LOGICAL_ERROR;
@@ -38,7 +37,6 @@ extern const int LOGICAL_ERROR;
 namespace RecordKVFormat
 {
-
 enum CFModifyFlag : UInt8
 {
     PutFlag = 'P',
@@ -83,17 +81,35 @@ inline TiKVKey encodeAsTiKVKey(const String & ori_str)
     return TiKVKey(ss.releaseStr());
 }
-inline UInt64 encodeUInt64(const UInt64 x) { return toBigEndian(x); }
+inline UInt64 encodeUInt64(const UInt64 x)
+{
+    return toBigEndian(x);
+}

-inline UInt64 encodeInt64(const Int64 x) { return encodeUInt64(static_cast<UInt64>(x) ^ SIGN_MASK); }
+inline UInt64 encodeInt64(const Int64 x)
+{
+    return encodeUInt64(static_cast<UInt64>(x) ^ SIGN_MASK);
+}

-inline UInt64 encodeUInt64Desc(const UInt64 x) { return encodeUInt64(~x); }
+inline UInt64 encodeUInt64Desc(const UInt64 x)
+{
+    return encodeUInt64(~x);
+}

-inline UInt64 decodeUInt64(const UInt64 x) { return toBigEndian(x); }
+inline UInt64 decodeUInt64(const UInt64 x)
+{
+    return toBigEndian(x);
+}

-inline UInt64 decodeUInt64Desc(const UInt64 x) { return ~decodeUInt64(x); }
+inline UInt64 decodeUInt64Desc(const UInt64 x)
+{
+    return ~decodeUInt64(x);
+}

-inline Int64 decodeInt64(const UInt64 x) { return static_cast<Int64>(decodeUInt64(x) ^ SIGN_MASK); }
+inline Int64 decodeInt64(const UInt64 x)
+{
+    return static_cast<Int64>(decodeUInt64(x) ^ SIGN_MASK);
+}
 inline void encodeInt64(const Int64 x, WriteBuffer & ss)
 {
@@ -125,7 +141,10 @@ inline DecodedTiKVKey genRawKey(const TableID tableId, const HandleID handleId)
     return key;
 }
-inline TiKVKey genKey(const TableID tableId, const HandleID handleId) { return encodeAsTiKVKey(genRawKey(tableId, handleId)); }
+inline TiKVKey genKey(const TableID tableId, const HandleID handleId)
+{
+    return encodeAsTiKVKey(genRawKey(tableId, handleId));
+}
 inline TiKVKey genKey(const TiDB::TableInfo & table_info, std::vector<Field> keys)
 {
@@ -135,9 +154,16 @@ inline TiKVKey genKey(const TiDB::TableInfo & table_info, std::vector<Field> keys)
     memcpy(key.data() + 1, reinterpret_cast<const char *>(&big_endian_table_id), 8);
     memcpy(key.data() + 1 + 8, RecordKVFormat::RECORD_PREFIX_SEP, 2);
     WriteBufferFromOwnString ss;
+
+    std::unordered_map<String, size_t> column_name_columns_index_map;
+    for (size_t i = 0; i < table_info.columns.size(); i++)
+    {
+        column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+    }
     for (size_t i = 0; i < keys.size(); i++)
     {
-        DB::EncodeDatum(keys[i], table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset].getCodecFlag(), ss);
+        auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+        DB::EncodeDatum(keys[i], table_info.columns[idx].getCodecFlag(), ss);
     }
     return encodeAsTiKVKey(key + ss.releaseStr());
 }
@@ -176,29 +202,50 @@ inline std::tuple decodeTiKVKeyFull(const TiKVKey & key)
     }
 }
-inline DecodedTiKVKey decodeTiKVKey(const TiKVKey & key) { return std::get<0>(decodeTiKVKeyFull(key)); }
+inline DecodedTiKVKey decodeTiKVKey(const TiKVKey & key)
+{
+    return std::get<0>(decodeTiKVKeyFull(key));
+}

-inline Timestamp getTs(const TiKVKey & key) { return decodeUInt64Desc(read<UInt64>(key.data() + key.dataSize() - 8)); }
+inline Timestamp getTs(const TiKVKey & key)
+{
+    return decodeUInt64Desc(read<UInt64>(key.data() + key.dataSize() - 8));
+}

-inline TableID getTableId(const DecodedTiKVKey & key) { return decodeInt64(read<UInt64>(key.data() + 1)); }
+inline TableID getTableId(const DecodedTiKVKey & key)
+{
+    return decodeInt64(read<UInt64>(key.data() + 1));
+}

-inline HandleID getHandle(const DecodedTiKVKey & key) { return decodeInt64(read<UInt64>(key.data() + RAW_KEY_NO_HANDLE_SIZE)); }
+inline HandleID getHandle(const DecodedTiKVKey & key)
+{
+    return decodeInt64(read<UInt64>(key.data() + RAW_KEY_NO_HANDLE_SIZE));
+}
 inline RawTiDBPK getRawTiDBPK(const DecodedTiKVKey & key)
 {
     return std::make_shared(key.begin() + RAW_KEY_NO_HANDLE_SIZE, key.end());
 }
-inline TableID getTableId(const TiKVKey & key) { return getTableId(decodeTiKVKey(key)); }
+inline TableID getTableId(const TiKVKey & key)
+{
+    return getTableId(decodeTiKVKey(key));
+}

-inline HandleID getHandle(const TiKVKey & key) { return getHandle(decodeTiKVKey(key)); }
+inline HandleID getHandle(const TiKVKey & key)
+{
+    return getHandle(decodeTiKVKey(key));
+}
 inline bool isRecord(const DecodedTiKVKey & raw_key)
 {
     return raw_key.size() >= RAW_KEY_SIZE && raw_key[0] == TABLE_PREFIX && memcmp(raw_key.data() + 9, RECORD_PREFIX_SEP, 2) == 0;
 }
-inline TiKVKey truncateTs(const TiKVKey & key) { return TiKVKey(String(key.data(), key.dataSize() - sizeof(Timestamp))); }
+inline TiKVKey truncateTs(const TiKVKey & key)
+{
+    return TiKVKey(String(key.data(), key.dataSize() - sizeof(Timestamp)));
+}
 inline TiKVKey appendTs(const TiKVKey & key, Timestamp ts)
 {
@@ -215,7 +262,12 @@ inline TiKVKey genKey(TableID tableId, HandleID handleId, Timestamp ts)
 }
 inline TiKVValue encodeLockCfValue(
-    UInt8 lock_type, const String & primary, Timestamp ts, UInt64 ttl, const String * short_value = nullptr, Timestamp min_commit_ts = 0)
+    UInt8 lock_type,
+    const String & primary,
+    Timestamp ts,
+    UInt64 ttl,
+    const String * short_value = nullptr,
+    Timestamp min_commit_ts = 0)
 {
     WriteBufferFromOwnString res;
     res.write(lock_type);
@@ -275,7 +327,10 @@ inline R readVarInt(const char *& data, size_t & len)
     return res;
 }
-inline UInt64 readVarUInt(const char *& data, size_t & len) { return readVarInt<UInt64>(data, len); }
+inline UInt64 readVarUInt(const char *& data, size_t & len)
+{
+    return readVarInt<UInt64>(data, len);
+}
 inline UInt8 readUInt8(const char *& data, size_t & len)
 {
@@ -347,30 +402,29 @@ inline DecodedWriteCFValue decodeWriteCfValue(const TiKVValue & value)
     auto flag = RecordKVFormat::readUInt8(data, len);
     switch (flag)
     {
-    case RecordKVFormat::SHORT_VALUE_PREFIX:
-    {
-        size_t slen = RecordKVFormat::readUInt8(data, len);
-        if (slen > len)
-            throw Exception("content len not equal to short value len", ErrorCodes::LOGICAL_ERROR);
-        short_value = RecordKVFormat::readRawString(data, len, slen);
-        break;
-    }
-    case RecordKVFormat::FLAG_OVERLAPPED_ROLLBACK:
-        // ignore
-        break;
-    case RecordKVFormat::GC_FENCE_PREFIX:
-    /**
+    case RecordKVFormat::SHORT_VALUE_PREFIX:
+    {
+        size_t slen = RecordKVFormat::readUInt8(data, len);
+        if (slen > len)
+            throw Exception("content len not equal to short value len", ErrorCodes::LOGICAL_ERROR);
+        short_value = RecordKVFormat::readRawString(data, len, slen);
+        break;
+    }
+    case RecordKVFormat::FLAG_OVERLAPPED_ROLLBACK:
+        // ignore
+        break;
+    case RecordKVFormat::GC_FENCE_PREFIX:
+        /**
         * according to https://github.com/tikv/tikv/pull/9207, when we meet the `GC fence` flag, it is definitely a
        * rewriting record and there must be a complete row written to tikv, so just ignore it in tiflash.
        */
-        return std::nullopt;
-    default:
-        throw Exception("invalid flag " + std::to_string(flag) + " in write cf", ErrorCodes::LOGICAL_ERROR);
+        return std::nullopt;
+    default:
+        throw Exception("invalid flag " + std::to_string(flag) + " in write cf", ErrorCodes::LOGICAL_ERROR);
     }
 }
-    return InnerDecodedWriteCFValue{write_type, prewrite_ts,
-        short_value.empty() ? nullptr : std::make_shared(short_value.data(), short_value.length())};
+    return InnerDecodedWriteCFValue{write_type, prewrite_ts, short_value.empty() ? nullptr : std::make_shared(short_value.data(), short_value.length())};
 }
 inline TiKVValue encodeWriteCfValue(UInt8 write_type, Timestamp ts, std::string_view short_value = {}, bool gc_fence = false)
diff --git a/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h b/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
index 20b395a9952..34e0d3d4104 100644
--- a/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
+++ b/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
@@ -237,14 +237,14 @@ std::pair> getTableInfoAndFields(ColumnIDs handle_
 {
     table_info.is_common_handle = true;
     TiDB::IndexInfo index_info;
-    for (size_t i = 0; i < handle_ids.size(); i++)
+    for (auto handle_id : handle_ids)
     {
         TiDB::IndexColumnInfo index_column_info;
-        for (size_t pos = 0; pos < table_info.columns.size(); pos++)
+        for (auto & column : table_info.columns)
         {
-            if (table_info.columns[pos].id == handle_ids[i])
+            if (column.id == handle_id)
             {
-                index_column_info.offset = pos;
+                index_column_info.name = column.name;
                 break;
             }
         }
diff --git a/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp b/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp
new file mode 100644
index 00000000000..05ab637de7f
--- /dev/null
+++ b/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp
@@ -0,0 +1,171 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
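Before the benchmark body, a short aside on the `TiKVRecordFormat.h` hunks above: `encodeInt64` flips the sign bit with `SIGN_MASK` and stores the result big-endian, so that byte-wise (memcmp) comparison of encoded keys agrees with signed numeric order. A self-contained illustration of the trick follows; `toBigEndian` and `SIGN_MASK` here are local stand-ins for TiFlash's helpers, not the real ones.

```cpp
// Standalone sketch of the memcomparable Int64 codec used for TiKV keys:
// flip the sign bit, then store big-endian, so memcmp order == numeric order.
#include <cassert>
#include <cstdint>
#include <cstring>

static constexpr uint64_t SIGN_MASK = 0x8000000000000000ULL; // assumed to match TiFlash's constant

static uint64_t toBigEndian(uint64_t x)
{
    // Lay the bytes out most-significant first, regardless of host endianness.
    uint8_t out[8];
    for (int i = 0; i < 8; ++i)
        out[i] = static_cast<uint8_t>(x >> (56 - 8 * i));
    uint64_t be;
    std::memcpy(&be, out, 8);
    return be;
}

static uint64_t encodeInt64(int64_t x)
{
    return toBigEndian(static_cast<uint64_t>(x) ^ SIGN_MASK);
}

int main()
{
    // -5 < 3 numerically, and the encodings compare the same way byte-wise.
    uint64_t a = encodeInt64(-5), b = encodeInt64(3);
    assert(std::memcmp(&a, &b, 8) < 0);
    return 0;
}
```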
+ +#include +#include +#include +#include + +#include "RowCodecTestUtils.h" + +using TableInfo = TiDB::TableInfo; +namespace DB::tests +{ +using ColumnIDs = std::vector; +class RegionBlockReaderBenchTest : public benchmark::Fixture +{ +protected: + Int64 handle_value = 100; + UInt8 del_mark_value = 0; + UInt64 version_value = 100; + + RegionDataReadInfoList data_list_read; + std::unordered_map fields_map; + + enum RowEncodeVersion + { + RowV1, + RowV2 + }; + +protected: + void SetUp(const benchmark::State & /*state*/) override + { + data_list_read.clear(); + fields_map.clear(); + } + + void encodeColumns(TableInfo & table_info, std::vector & fields, RowEncodeVersion row_version, size_t num_rows) + { + // for later check + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + fields_map.emplace(table_info.columns[i].id, fields[i]); + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } + + std::vector value_fields; + std::vector pk_fields; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + if (!table_info.columns[i].hasPriKeyFlag()) + value_fields.emplace_back(fields[i]); + else + pk_fields.emplace_back(fields[i]); + } + + // create PK + WriteBufferFromOwnString pk_buf; + if (table_info.is_common_handle) + { + auto & primary_index_info = table_info.getPrimaryIndexInfo(); + for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++) + { + auto idx = column_name_columns_index_map[primary_index_info.idx_cols[i].name]; + EncodeDatum(pk_fields[i], table_info.columns[idx].getCodecFlag(), pk_buf); + } + } + else + { + DB::EncodeInt64(handle_value, pk_buf); + } + RawTiDBPK pk{std::make_shared(pk_buf.releaseStr())}; + // create value + WriteBufferFromOwnString value_buf; + if (row_version == RowEncodeVersion::RowV1) + { + encodeRowV1(table_info, value_fields, value_buf); + } + else if (row_version == RowEncodeVersion::RowV2) + { + encodeRowV2(table_info, value_fields, value_buf); + } + else + { + throw Exception("Unknown row format " + std::to_string(row_version), ErrorCodes::LOGICAL_ERROR); + } + auto row_value = std::make_shared(std::move(value_buf.str())); + for (size_t i = 0; i < num_rows; i++) + data_list_read.emplace_back(pk, del_mark_value, version_value, row_value); + } + + bool decodeColumns(DecodingStorageSchemaSnapshotConstPtr decoding_schema, bool force_decode) const + { + RegionBlockReader reader{decoding_schema}; + Block block = createBlockSortByColumnID(decoding_schema); + return reader.read(block, data_list_read, force_decode); + } + + std::pair> getNormalTableInfoFields(const ColumnIDs & handle_ids, bool is_common_handle) const + { + return getTableInfoAndFields( + handle_ids, + is_common_handle, + ColumnIDValue(2, handle_value), + ColumnIDValue(3, std::numeric_limits::max()), + ColumnIDValue(4, std::numeric_limits::min()), + ColumnIDValue(9, String("aaa")), + ColumnIDValue(10, DecimalField(ToDecimal(12345678910ULL, 4), 4)), + ColumnIDValueNull(11)); + } +}; + +BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, CommonHandle) +(benchmark::State & state) +{ + size_t num_rows = state.range(0); + auto [table_info, fields] = getNormalTableInfoFields({2, 3, 4}, true); + encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows); + auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info); + for (auto _ : state) + { + decodeColumns(decoding_schema, true); + } +} + + +BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, PKIsNotHandle) +(benchmark::State & state) +{ + size_t num_rows = 
state.range(0); + auto [table_info, fields] = getNormalTableInfoFields({EXTRA_HANDLE_COLUMN_ID}, false); + encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows); + auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info); + for (auto _ : state) + { + decodeColumns(decoding_schema, true); + } +} + +BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, PKIsHandle) +(benchmark::State & state) +{ + size_t num_rows = state.range(0); + auto [table_info, fields] = getNormalTableInfoFields({2}, false); + encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows); + auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info); + for (auto _ : state) + { + decodeColumns(decoding_schema, true); + } +} + +constexpr size_t num_iterations_test = 1000; + +BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, PKIsHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100); +BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, CommonHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100); +BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, PKIsNotHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100); + +} // namespace DB::tests diff --git a/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp b/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp new file mode 100644 index 00000000000..1de9809ecad --- /dev/null +++ b/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp @@ -0,0 +1,65 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
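In the registrations above, Google Benchmark hands each `->Arg(...)` value to the fixture through `state.range(0)`, so every benchmark body runs once per row count (1, 10, and 100), capped at `num_iterations_test` iterations. A minimal standalone example of the same pattern, independent of the TiFlash fixtures:

```cpp
// Minimal example of the ->Arg()/state.range(0) pattern used by the benchmark above.
#include <benchmark/benchmark.h>
#include <vector>

static void BM_FillVector(benchmark::State & state)
{
    const size_t num_rows = state.range(0); // receives 1, 10, 100 in turn
    for (auto _ : state)
    {
        std::vector<int> rows(num_rows, 42);
        benchmark::DoNotOptimize(rows.data()); // keep the work from being optimized away
    }
}
BENCHMARK(BM_FillVector)->Arg(1)->Arg(10)->Arg(100);

BENCHMARK_MAIN();
```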
+
+#include
+#include
+#include
+
+#include "RowCodecTestUtils.h"
+
+namespace DB::tests
+{
+static TableInfo getTableInfoByJson(const String & json_table_info)
+{
+    return TableInfo(json_table_info);
+}
+TEST(DecodingStorageSchemaSnapshotTest, CheckPKInfosUnderClusteredIndex)
+{
+    // table with columns [A,B,C,D], primary keys [A,C]
+    const String json_table_info = R"json({"id":75,"name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"A","L":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":2,"name":{"O":"B","L":"b"},"offset":1,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":0,"Flen":20,"Decimal":0,"Charset":"utf8mb4","Collate":"utf8mb4_bin","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":3,"name":{"O":"C","L":"c"},"offset":2,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":4,"name":{"O":"D","L":"d"},"offset":3,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"A","L":"a"},"offset":0,"length":-1},{"name":{"O":"C","L":"c"},"offset":2,"length":-1}],"state":5,"comment":"","index_type":1,"is_unique":true,"is_primary":true,"is_invisible":false,"is_global":false}],"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":true,"common_handle_version":1,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":4,"max_idx_id":1,"max_cst_id":0,"update_timestamp":434039123413303302,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"pre_split_regions":0,"partition":null,"compression":"","view":null,"sequence":null,"Lock":null,"version":4,"tiflash_replica":{"Count":1,"LocationLabels":[],"Available":false,"AvailablePartitionIDs":null},"is_columnar":false,"temp_table_type":0,"cache_table_status":0,"policy_ref_info":null,"stats_options":null})json";
+    auto table_info = getTableInfoByJson(json_table_info);
+    auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info);
+
+    // check decoding_schema->pk_column_ids info
+    ASSERT_EQ(decoding_schema->pk_column_ids.size(), 2);
+    ASSERT_EQ(decoding_schema->pk_column_ids[0], 1);
+    ASSERT_EQ(decoding_schema->pk_column_ids[1], 3);
+
+    // check decoding_schema->pk_pos_map info
+    ASSERT_EQ(decoding_schema->pk_column_ids.size(), decoding_schema->pk_pos_map.size());
+    // there are three hidden columns in the decoded block, so the positions of A and C are 3 and 5
+    ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[0]), 3);
+    ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[1]), 5);
+}
+
+TEST(DecodingStorageSchemaSnapshotTest, CheckPKInfosUnderClusteredIndexAfterDropColumn)
+{
+    // after dropping column B from [A,B,C,D], the table has columns [A,C,D], primary keys [A,C]
+    const String json_table_info = R"json({"id":75,"name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"A","L":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":3,"name":{"O":"C","L":"c"},"offset":2,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":4,"name":{"O":"D","L":"d"},"offset":3,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"A","L":"a"},"offset":0,"length":-1},{"name":{"O":"C","L":"c"},"offset":2,"length":-1}],"state":5,"comment":"","index_type":1,"is_unique":true,"is_primary":true,"is_invisible":false,"is_global":false}],"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":true,"common_handle_version":1,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":4,"max_idx_id":1,"max_cst_id":0,"update_timestamp":434039123413303302,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"pre_split_regions":0,"partition":null,"compression":"","view":null,"sequence":null,"Lock":null,"version":4,"tiflash_replica":{"Count":1,"LocationLabels":[],"Available":false,"AvailablePartitionIDs":null},"is_columnar":false,"temp_table_type":0,"cache_table_status":0,"policy_ref_info":null,"stats_options":null})json";
+    auto table_info = getTableInfoByJson(json_table_info);
+    auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info);
+
+    // check decoding_schema->pk_column_ids info
+    ASSERT_EQ(decoding_schema->pk_column_ids.size(), 2);
+    ASSERT_EQ(decoding_schema->pk_column_ids[0], 1);
+    ASSERT_EQ(decoding_schema->pk_column_ids[1], 3);
+
+    // check decoding_schema->pk_pos_map info
+    ASSERT_EQ(decoding_schema->pk_column_ids.size(), decoding_schema->pk_pos_map.size());
+    // there are three hidden columns in the decoded block, so the positions of A and C are 3 and 4
+    ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[0]), 3);
+    ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[1]), 4);
+}
+
+} // namespace DB::tests
diff --git a/dbms/src/Storages/Transaction/tests/gtest_kvstore.cpp b/dbms/src/Storages/Transaction/tests/gtest_kvstore.cpp
index 
f0cafce3914..77aab06f6cf 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_kvstore.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_kvstore.cpp @@ -18,10 +18,9 @@ #include #include #include +#include #include -#include "region_helper.h" - namespace DB { namespace RegionBench @@ -37,13 +36,6 @@ extern void ChangeRegionStateRange(RegionState & region_state, bool source_at_le namespace tests { -RegionPtr makeRegion(UInt64 id, const std::string start_key, const std::string end_key, const TiFlashRaftProxyHelper * proxy_helper = nullptr) -{ - return std::make_shared( - RegionMeta(createPeer(2, true), createRegionInfo(id, std::move(start_key), std::move(end_key)), initialApplyState()), - proxy_helper); -} - class RegionKVStoreTest : public ::testing::Test { public: @@ -1187,6 +1179,12 @@ void RegionKVStoreTest::testKVStore() ASSERT_EQ(e.message(), "unsupported admin command type InvalidAdmin"); } } + { + // There shall be data to flush. + ASSERT_EQ(kvs.needFlushRegionData(19, ctx.getTMTContext()), true); + // Force flush until succeed only for testing. + ASSERT_EQ(kvs.tryFlushRegionData(19, true, ctx.getTMTContext()), true); + } } void test_mergeresult() diff --git a/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp b/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp index 6a883230854..d08b4dd3738 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp @@ -26,13 +26,13 @@ using ColumnIDs = std::vector; class RegionBlockReaderTestFixture : public ::testing::Test { protected: - Int64 handle_value_ = 100; - UInt8 del_mark_value_ = 0; - UInt64 version_value_ = 100; - size_t rows_ = 3; + Int64 handle_value = 100; + UInt8 del_mark_value = 0; + UInt64 version_value = 100; + size_t rows = 3; - RegionDataReadInfoList data_list_read_; - std::unordered_map fields_map_; + RegionDataReadInfoList data_list_read; + std::unordered_map fields_map; enum RowEncodeVersion { @@ -43,8 +43,8 @@ class RegionBlockReaderTestFixture : public ::testing::Test protected: void SetUp() override { - data_list_read_.clear(); - fields_map_.clear(); + data_list_read.clear(); + fields_map.clear(); } void TearDown() override {} @@ -52,8 +52,12 @@ class RegionBlockReaderTestFixture : public ::testing::Test void encodeColumns(TableInfo & table_info, std::vector & fields, RowEncodeVersion row_version) { // for later check + std::unordered_map column_name_columns_index_map; for (size_t i = 0; i < table_info.columns.size(); i++) - fields_map_.emplace(table_info.columns[i].id, fields[i]); + { + fields_map.emplace(table_info.columns[i].id, fields[i]); + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } std::vector value_fields; std::vector pk_fields; @@ -72,13 +76,13 @@ class RegionBlockReaderTestFixture : public ::testing::Test auto & primary_index_info = table_info.getPrimaryIndexInfo(); for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++) { - size_t pk_offset = primary_index_info.idx_cols[i].offset; - EncodeDatum(pk_fields[i], table_info.columns[pk_offset].getCodecFlag(), pk_buf); + auto idx = column_name_columns_index_map[primary_index_info.idx_cols[i].name]; + EncodeDatum(pk_fields[i], table_info.columns[idx].getCodecFlag(), pk_buf); } } else { - DB::EncodeInt64(handle_value_, pk_buf); + DB::EncodeInt64(handle_value, pk_buf); } RawTiDBPK pk{std::make_shared(pk_buf.releaseStr())}; // create value @@ -96,44 +100,44 @@ class RegionBlockReaderTestFixture : public 
::testing::Test throw Exception("Unknown row format " + std::to_string(row_version), ErrorCodes::LOGICAL_ERROR); } auto row_value = std::make_shared(std::move(value_buf.str())); - for (size_t i = 0; i < rows_; i++) - data_list_read_.emplace_back(pk, del_mark_value_, version_value_, row_value); + for (size_t i = 0; i < rows; i++) + data_list_read.emplace_back(pk, del_mark_value, version_value, row_value); } void checkBlock(DecodingStorageSchemaSnapshotConstPtr decoding_schema, const Block & block) const { ASSERT_EQ(block.columns(), decoding_schema->column_defines->size()); - for (size_t row = 0; row < rows_; row++) + for (size_t row = 0; row < rows; row++) { for (size_t pos = 0; pos < block.columns(); pos++) { - auto & column_element = block.getByPosition(pos); + const auto & column_element = block.getByPosition(pos); if (row == 0) { - ASSERT_EQ(column_element.column->size(), rows_); + ASSERT_EQ(column_element.column->size(), rows); } if (column_element.name == EXTRA_HANDLE_COLUMN_NAME) { if (decoding_schema->is_common_handle) { - ASSERT_EQ((*column_element.column)[row], Field(*std::get<0>(data_list_read_[row]))); + ASSERT_EQ((*column_element.column)[row], Field(*std::get<0>(data_list_read[row]))); } else { - ASSERT_EQ((*column_element.column)[row], Field(handle_value_)); + ASSERT_EQ((*column_element.column)[row], Field(handle_value)); } } else if (column_element.name == VERSION_COLUMN_NAME) { - ASSERT_EQ((*column_element.column)[row], Field(version_value_)); + ASSERT_EQ((*column_element.column)[row], Field(version_value)); } else if (column_element.name == TAG_COLUMN_NAME) { - ASSERT_EQ((*column_element.column)[row], Field(NearestFieldType::Type(del_mark_value_))); + ASSERT_EQ((*column_element.column)[row], Field(NearestFieldType::Type(del_mark_value))); } else { - ASSERT_EQ((*column_element.column)[row], fields_map_.at(column_element.column_id)); + ASSERT_EQ((*column_element.column)[row], fields_map.at(column_element.column_id)); } } } @@ -143,7 +147,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test { RegionBlockReader reader{decoding_schema}; Block block = createBlockSortByColumnID(decoding_schema); - if (!reader.read(block, data_list_read_, force_decode)) + if (!reader.read(block, data_list_read, force_decode)) return false; checkBlock(decoding_schema, block); @@ -155,7 +159,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test return getTableInfoAndFields( handle_ids, is_common_handle, - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(3, std::numeric_limits::max()), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(9, String("aaa")), @@ -170,7 +174,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test handle_ids, is_common_handle, ColumnIDValue(1, String("")), - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(3, std::numeric_limits::max()), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(8, String("")), @@ -182,12 +186,12 @@ class RegionBlockReaderTestFixture : public ::testing::Test // add default value for missing column std::vector missing_column_ids{1, 8, 13}; String missing_column_default_value = String("default"); - for (size_t i = 0; i < table_info.columns.size(); i++) + for (auto & column : table_info.columns) { - if (std::find(missing_column_ids.begin(), missing_column_ids.end(), table_info.columns[i].id) != missing_column_ids.end()) + if (std::find(missing_column_ids.begin(), missing_column_ids.end(), column.id) != missing_column_ids.end()) { - 
table_info.columns[i].origin_default_value = missing_column_default_value; - fields_map_.emplace(table_info.columns[i].id, Field(missing_column_default_value)); + column.origin_default_value = missing_column_default_value; + fields_map.emplace(column.id, Field(missing_column_default_value)); } } return table_info; @@ -199,7 +203,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test std::tie(table_info, std::ignore) = getTableInfoAndFields( handle_ids, is_common_handle, - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(9, String("aaa")), ColumnIDValue(10, DecimalField(ToDecimal(12345678910ULL, 4), 4))); @@ -212,7 +216,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test std::tie(table_info, std::ignore) = getTableInfoAndFields( handle_ids, is_common_handle, - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(3, std::numeric_limits::max()), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(9, String("aaa")), @@ -227,7 +231,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test std::tie(table_info, std::ignore) = getTableInfoAndFields( handle_ids, is_common_handle, - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(3, std::numeric_limits::max()), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(9, String("aaa")), diff --git a/dbms/src/Storages/Transaction/tests/gtest_table_info.cpp b/dbms/src/Storages/Transaction/tests/gtest_table_info.cpp index 516a173b151..871153cb0e9 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_table_info.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_table_info.cpp @@ -42,7 +42,7 @@ struct ParseCase std::function check; }; -TEST(TiDBTableInfo_test, ParseFromJSON) +TEST(TiDBTableInfoTest, ParseFromJSON) try { auto cases = { @@ -136,54 +136,54 @@ struct StmtCase } }; -TEST(TiDBTableInfo_test, GenCreateTableStatement) +TEST(TiDBTableInfoTest, GenCreateTableStatement) try { auto cases = // {StmtCase{ 1145, // R"json({"id":1939,"db_name":{"O":"customer","L":"customer"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":1145,"name":{"O":"customerdebt","L":"customerdebt"},"cols":[{"id":1,"name":{"O":"id","L":"id"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"type":{"Tp":8,"Flag":515,"Flen":20,"Decimal":0},"state":5,"comment":"i\"d"}],"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"负债信息","partition":null})json", // - R"stmt(CREATE TABLE `customer`.`customerdebt`(`id` Int64) Engine = DeltaMerge((`id`), '{"cols":[{"comment":"i\\"d","default":null,"default_bit":null,"id":1,"name":{"L":"id","O":"id"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":null,"Collate":null,"Decimal":0,"Elems":null,"Flag":515,"Flen":20,"Tp":8}}],"comment":"\\u8D1F\\u503A\\u4FE1\\u606F","id":1145,"index_info":[],"is_common_handle":false,"name":{"L":"customerdebt","O":"customerdebt"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":0}'))stmt", // + R"json({"id":1145,"name":{"O":"customerdebt","L":"customerdebt"},"cols":[{"id":1,"name":{"O":"id","L":"id"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"type":{"Tp":8,"Flag":515,"Flen":20,"Decimal":0},"state":5,"comment":"i\"d"}],"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"负债信息","partition":null,"tiflash_mode":"fast"})json", // + R"stmt(CREATE TABLE 
`customer`.`customerdebt`(`id` Int64) Engine = DeltaMerge((`id`), '{"cols":[{"comment":"i\\"d","default":null,"default_bit":null,"id":1,"name":{"L":"id","O":"id"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":null,"Collate":null,"Decimal":0,"Elems":null,"Flag":515,"Flen":20,"Tp":8}}],"comment":"\\u8D1F\\u503A\\u4FE1\\u606F","id":1145,"index_info":[],"is_common_handle":false,"name":{"L":"customerdebt","O":"customerdebt"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_mode":"fast","tiflash_replica":{"Count":0},"update_timestamp":0}'))stmt", // }, StmtCase{ 2049, // R"json({"id":1939,"db_name":{"O":"customer","L":"customer"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":2049,"name":{"O":"customerdebt","L":"customerdebt"},"cols":[{"id":1,"name":{"O":"id","L":"id"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"type":{"Tp":8,"Flag":515,"Flen":20,"Decimal":0},"state":5,"comment":"i\"d"}],"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"负债信息","update_timestamp":404545295996944390,"partition":null})json", // - R"stmt(CREATE TABLE `customer`.`customerdebt`(`id` Int64) Engine = DeltaMerge((`id`), '{"cols":[{"comment":"i\\"d","default":null,"default_bit":null,"id":1,"name":{"L":"id","O":"id"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":null,"Collate":null,"Decimal":0,"Elems":null,"Flag":515,"Flen":20,"Tp":8}}],"comment":"\\u8D1F\\u503A\\u4FE1\\u606F","id":2049,"index_info":[],"is_common_handle":false,"name":{"L":"customerdebt","O":"customerdebt"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404545295996944390}'))stmt", // + R"json({"id":2049,"name":{"O":"customerdebt","L":"customerdebt"},"cols":[{"id":1,"name":{"O":"id","L":"id"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"type":{"Tp":8,"Flag":515,"Flen":20,"Decimal":0},"state":5,"comment":"i\"d"}],"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"负债信息","update_timestamp":404545295996944390,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `customer`.`customerdebt`(`id` Int64) Engine = DeltaMerge((`id`), '{"cols":[{"comment":"i\\"d","default":null,"default_bit":null,"id":1,"name":{"L":"id","O":"id"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":null,"Collate":null,"Decimal":0,"Elems":null,"Flag":515,"Flen":20,"Tp":8}}],"comment":"\\u8D1F\\u503A\\u4FE1\\u606F","id":2049,"index_info":[],"is_common_handle":false,"name":{"L":"customerdebt","O":"customerdebt"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404545295996944390}'))stmt", // }, StmtCase{ 31, // R"json({"id":1,"db_name":{"O":"db1","L":"db1"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":31,"name":{"O":"simple_t","L":"simple_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545295996944390,"ShardRowIDBits":0,"partition":null})json", // - R"stmt(CREATE TABLE 
`db1`.`simple_t`(`i` Nullable(Int32), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":0,"Flen":11,"Tp":3}}],"comment":"","id":31,"index_info":[],"is_common_handle":false,"name":{"L":"simple_t","O":"simple_t"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404545295996944390}'))stmt", // + R"json({"id":31,"name":{"O":"simple_t","L":"simple_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545295996944390,"ShardRowIDBits":0,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `db1`.`simple_t`(`i` Nullable(Int32), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":0,"Flen":11,"Tp":3}}],"comment":"","id":31,"index_info":[],"is_common_handle":false,"name":{"L":"simple_t","O":"simple_t"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404545295996944390}'))stmt", // }, StmtCase{ 33, // R"json({"id":2,"db_name":{"O":"db2","L":"db2"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":33,"name":{"O":"pk_t","L":"pk_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":3,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545312978108418,"ShardRowIDBits":0,"partition":null})json", // - R"stmt(CREATE TABLE `db2`.`pk_t`(`i` Int32) Engine = DeltaMerge((`i`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":3,"Flen":11,"Tp":3}}],"comment":"","id":33,"index_info":[],"is_common_handle":false,"name":{"L":"pk_t","O":"pk_t"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404545312978108418}'))stmt", // + 
R"json({"id":33,"name":{"O":"pk_t","L":"pk_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":3,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545312978108418,"ShardRowIDBits":0,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `db2`.`pk_t`(`i` Int32) Engine = DeltaMerge((`i`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":3,"Flen":11,"Tp":3}}],"comment":"","id":33,"index_info":[],"is_common_handle":false,"name":{"L":"pk_t","O":"pk_t"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404545312978108418}'))stmt", // }, StmtCase{ 35, // R"json({"id":1,"db_name":{"O":"db1","L":"db1"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":35,"name":{"O":"not_null_t","L":"not_null_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4097,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545324922961926,"ShardRowIDBits":0,"partition":null})json", // - R"stmt(CREATE TABLE `db1`.`not_null_t`(`i` Int32, `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":4097,"Flen":11,"Tp":3}}],"comment":"","id":35,"index_info":[],"is_common_handle":false,"name":{"L":"not_null_t","O":"not_null_t"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404545324922961926}'))stmt", // + R"json({"id":35,"name":{"O":"not_null_t","L":"not_null_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4097,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545324922961926,"ShardRowIDBits":0,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `db1`.`not_null_t`(`i` Int32, `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), 
'{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":4097,"Flen":11,"Tp":3}}],"comment":"","id":35,"index_info":[],"is_common_handle":false,"name":{"L":"not_null_t","O":"not_null_t"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404545324922961926}'))stmt", // }, StmtCase{ 37, // R"json({"id":2,"db_name":{"O":"db2","L":"db2"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", - R"json({"id":37,"name":{"O":"mytable","L":"mytable"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"mycol","L":"mycol"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":4099,"Flen":256,"Decimal":0,"Charset":"utf8","Collate":"utf8_bin","Elems":null},"state":5,"comment":""}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"mycol","L":"mycol"},"offset":0,"length":-1}],"is_unique":true,"is_primary":true,"state":5,"comment":"","index_type":1}],"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":1,"update_timestamp":404566455285710853,"ShardRowIDBits":0,"partition":null})json", // - R"stmt(CREATE TABLE `db2`.`mytable`(`mycol` String) Engine = DeltaMerge((`mycol`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"mycol","O":"mycol"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"utf8","Collate":"utf8_bin","Decimal":0,"Elems":null,"Flag":4099,"Flen":256,"Tp":15}}],"comment":"","id":37,"index_info":[],"is_common_handle":false,"name":{"L":"mytable","O":"mytable"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404566455285710853}'))stmt", // + R"json({"id":37,"name":{"O":"mytable","L":"mytable"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"mycol","L":"mycol"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":4099,"Flen":256,"Decimal":0,"Charset":"utf8","Collate":"utf8_bin","Elems":null},"state":5,"comment":""}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"mycol","L":"mycol"},"offset":0,"length":-1}],"is_unique":true,"is_primary":true,"state":5,"comment":"","index_type":1}],"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":1,"update_timestamp":404566455285710853,"ShardRowIDBits":0,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `db2`.`mytable`(`mycol` String) Engine = DeltaMerge((`mycol`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"mycol","O":"mycol"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"utf8","Collate":"utf8_bin","Decimal":0,"Elems":null,"Flag":4099,"Flen":256,"Tp":15}}],"comment":"","id":37,"index_info":[],"is_common_handle":false,"name":{"L":"mytable","O":"mytable"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404566455285710853}'))stmt", // }, StmtCase{ 32, // 
R"json({"id":1,"db_name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":31,"name":{"O":"range_part_t","L":"range_part_t"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","version":0}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":407445773801488390,"ShardRowIDBits":0,"partition":{"type":1,"expr":"`i`","columns":null,"enable":true,"definitions":[{"id":32,"name":{"O":"p0","L":"p0"},"less_than":["0"]},{"id":33,"name":{"O":"p1","L":"p1"},"less_than":["100"]}],"num":0},"compression":"","version":1})json", // - R"stmt(CREATE TABLE `test`.`range_part_t_32`(`i` Nullable(Int32), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"belonging_table_id":31,"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":0,"Flen":11,"Tp":3}}],"comment":"","id":32,"index_info":[],"is_common_handle":false,"is_partition_sub_table":true,"name":{"L":"range_part_t_32","O":"range_part_t_32"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":407445773801488390}'))stmt", // + R"json({"id":31,"name":{"O":"range_part_t","L":"range_part_t"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","version":0}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":407445773801488390,"ShardRowIDBits":0,"partition":{"type":1,"expr":"`i`","columns":null,"enable":true,"definitions":[{"id":32,"name":{"O":"p0","L":"p0"},"less_than":["0"]},{"id":33,"name":{"O":"p1","L":"p1"},"less_than":["100"]}],"num":0},"compression":"","version":1,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `test`.`range_part_t_32`(`i` Nullable(Int32), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"belonging_table_id":31,"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":0,"Flen":11,"Tp":3}}],"comment":"","id":32,"index_info":[],"is_common_handle":false,"is_partition_sub_table":true,"name":{"L":"range_part_t_32","O":"range_part_t_32"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":407445773801488390}'))stmt", // }}; - for (auto & c : cases) + for (const auto & c : cases) { c.verifyTableInfo(); } diff --git a/dbms/src/Storages/Transaction/tests/region_helper.h b/dbms/src/Storages/Transaction/tests/region_helper.h index 2808ace0ecb..39bae2669ab 100644 --- a/dbms/src/Storages/Transaction/tests/region_helper.h +++ 
b/dbms/src/Storages/Transaction/tests/region_helper.h
@@ -18,8 +18,10 @@
 #include
-using namespace DB;
-
+namespace DB
+{
+namespace tests
+{
 #define ASSERT_CHECK(cond, res) \
     do \
     { \
@@ -37,7 +39,7 @@ using namespace DB;
 #define ASSERT_CHECK_EQUAL(a, b, res) \
     do \
     { \
-        if (!(a == b)) \
+        if (!((a) == (b))) \
         { \
             std::cerr << __FILE__ << ":" << __LINE__ << ":" \
                       << " Assertion " << #a << " == " << #b << " failed.\n"; \
@@ -76,3 +78,16 @@ inline RegionMeta createRegionMeta(UInt64 id, DB::TableID table_id, std::optiona
         /*region=*/createRegionInfo(id, RecordKVFormat::genKey(table_id, 0), RecordKVFormat::genKey(table_id, 300)),
         /*apply_state_=*/apply_state.value_or(initialApplyState()));
 }
+
+inline RegionPtr makeRegion(UInt64 id, std::string start_key, std::string end_key, const TiFlashRaftProxyHelper * proxy_helper = nullptr)
+{
+    return std::make_shared<Region>(
+        RegionMeta(
+            createPeer(2, true),
+            createRegionInfo(id, std::move(start_key), std::move(end_key)),
+            initialApplyState()),
+        proxy_helper);
+}
+
+} // namespace tests
+} // namespace DB
diff --git a/dbms/src/Storages/registerStorages.cpp b/dbms/src/Storages/registerStorages.cpp
index a709be0b017..ddf815316ab 100644
--- a/dbms/src/Storages/registerStorages.cpp
+++ b/dbms/src/Storages/registerStorages.cpp
@@ -27,7 +27,6 @@ void registerStorageNull(StorageFactory & factory);
 void registerStorageMerge(StorageFactory & factory);
 void registerStorageBuffer(StorageFactory & factory);
 void registerStorageMemory(StorageFactory & factory);
-void registerStorageFile(StorageFactory & factory);
 void registerStorageDictionary(StorageFactory & factory);
 void registerStorageSet(StorageFactory & factory);
 void registerStorageJoin(StorageFactory & factory);
@@ -47,7 +46,6 @@ void registerStorages()
     registerStorageMerge(factory);
     registerStorageBuffer(factory);
     registerStorageMemory(factory);
-    registerStorageFile(factory);
     registerStorageDictionary(factory);
     registerStorageSet(factory);
     registerStorageJoin(factory);
diff --git a/dbms/src/Storages/tests/gtest_filter_parser.cpp b/dbms/src/Storages/tests/gtest_filter_parser.cpp
index 8820c05d2da..3a554fcf4b6 100644
--- a/dbms/src/Storages/tests/gtest_filter_parser.cpp
+++ b/dbms/src/Storages/tests/gtest_filter_parser.cpp
@@ -98,7 +98,7 @@ DM::RSOperatorPtr FilterParserTest::generateRsOperator(const String table_info_j
     DM::ColumnDefines columns_to_read;
     {
         NamesAndTypes source_columns;
-        std::tie(source_columns, std::ignore) = parseColumnsFromTableInfo(table_info, log->getLog());
+        std::tie(source_columns, std::ignore) = parseColumnsFromTableInfo(table_info);
         dag_query = std::make_unique(
             conditions,
             DAGPreparedSets(),
diff --git a/dbms/src/TableFunctions/ITableFunction.cpp b/dbms/src/TableFunctions/ITableFunction.cpp
index ca05075cac0..d262a5637f7 100644
--- a/dbms/src/TableFunctions/ITableFunction.cpp
+++ b/dbms/src/TableFunctions/ITableFunction.cpp
@@ -15,17 +15,10 @@
 #include
 #include
-
-namespace ProfileEvents
-{
-extern const Event TableFunctionExecute;
-}
-
 namespace DB
 {
 StoragePtr ITableFunction::execute(const ASTPtr & ast_function, const Context & context) const
 {
-    ProfileEvents::increment(ProfileEvents::TableFunctionExecute);
     return executeImpl(ast_function, context);
 }
diff --git a/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp b/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp
deleted file mode 100644
index ab5cd7e5849..00000000000
--- a/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2022 PingCAP, Ltd. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ -extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -extern const int BAD_ARGUMENTS; -} // namespace ErrorCodes - - -StoragePtr TableFunctionCatBoostPool::executeImpl(const ASTPtr & ast_function, const Context & context) const -{ - ASTs & args_func = typeid_cast(*ast_function).children; - - std::string err = "Table function '" + getName() + "' requires 2 parameters: " - + "column descriptions file, dataset description file"; - - if (args_func.size() != 1) - throw Exception(err, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - ASTs & args = typeid_cast(*args_func.at(0)).children; - - if (args.size() != 2) - throw Exception(err, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - auto getStringLiteral = [](const IAST & node, const char * description) { - auto lit = typeid_cast(&node); - if (!lit) - throw Exception(description + String(" must be string literal (in single quotes)."), ErrorCodes::BAD_ARGUMENTS); - - if (lit->value.getType() != Field::Types::String) - throw Exception(description + String(" must be string literal (in single quotes)."), ErrorCodes::BAD_ARGUMENTS); - - return safeGet(lit->value); - }; - String column_descriptions_file = getStringLiteral(*args[0], "Column descriptions file"); - String dataset_description_file = getStringLiteral(*args[1], "Dataset description file"); - - return StorageCatBoostPool::create(context, column_descriptions_file, dataset_description_file); -} - -void registerTableFunctionCatBoostPool(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - -} // namespace DB diff --git a/dbms/src/TableFunctions/TableFunctionCatBoostPool.h b/dbms/src/TableFunctions/TableFunctionCatBoostPool.h deleted file mode 100644 index 0b5b32dfffe..00000000000 --- a/dbms/src/TableFunctions/TableFunctionCatBoostPool.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - - -namespace DB -{ -/* catboostPool('column_descriptions_file', 'dataset_description_file') - * Create storage from CatBoost dataset. 
- */ -class TableFunctionCatBoostPool : public ITableFunction -{ -public: - static constexpr auto name = "catBoostPool"; - std::string getName() const override { return name; } - -private: - StoragePtr executeImpl(const ASTPtr & ast_function, const Context & context) const override; -}; - -} // namespace DB diff --git a/dbms/src/TableFunctions/TableFunctionFile.cpp b/dbms/src/TableFunctions/TableFunctionFile.cpp deleted file mode 100644 index 0ff1a5b443f..00000000000 --- a/dbms/src/TableFunctions/TableFunctionFile.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace DB -{ -namespace ErrorCodes -{ -extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -extern const int DATABASE_ACCESS_DENIED; -} // namespace ErrorCodes - -StoragePtr TableFunctionFile::executeImpl(const ASTPtr & ast_function, const Context & context) const -{ - // Parse args - ASTs & args_func = typeid_cast(*ast_function).children; - - if (args_func.size() != 1) - throw Exception("Table function '" + getName() + "' must have arguments.", ErrorCodes::LOGICAL_ERROR); - - ASTs & args = typeid_cast(*args_func.at(0)).children; - - if (args.size() != 3) - throw Exception("Table function '" + getName() + "' requires exactly 3 arguments: path, format and structure.", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - for (size_t i = 0; i < 3; ++i) - args[i] = evaluateConstantExpressionOrIdentifierAsLiteral(args[i], context); - - std::string path = static_cast(*args[0]).value.safeGet(); - std::string format = static_cast(*args[1]).value.safeGet(); - std::string structure = static_cast(*args[2]).value.safeGet(); - - // Create sample block - std::vector structure_vals; - boost::split(structure_vals, structure, boost::algorithm::is_any_of(" ,"), boost::algorithm::token_compress_on); - - if (structure_vals.size() % 2 != 0) - throw Exception("Odd number of elements in section structure: must be a list of name type pairs", ErrorCodes::LOGICAL_ERROR); - - Block sample_block; - const DataTypeFactory & data_type_factory = DataTypeFactory::instance(); - - for (size_t i = 0, size = structure_vals.size(); i < size; i += 2) - { - ColumnWithTypeAndName column; - column.name = structure_vals[i]; - column.type = data_type_factory.get(structure_vals[i + 1]); - column.column = column.type->createColumn(); - sample_block.insert(std::move(column)); - } - - // Create table - StoragePtr storage = StorageFile::create( - path, - -1, - context.getUserFilesPath(), - getName(), - format, - ColumnsDescription{sample_block.getNamesAndTypesList()}, - const_cast(context)); - - storage->startup(); - - return storage; -} - - -void registerTableFunctionFile(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - -} // namespace DB diff --git a/dbms/src/TableFunctions/TableFunctionFile.h b/dbms/src/TableFunctions/TableFunctionFile.h deleted file mode 
100644 index dda367c2679..00000000000 --- a/dbms/src/TableFunctions/TableFunctionFile.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - - -namespace DB -{ -/* file(path, format, structure) - creates a temporary storage from file - * - * - * The file must be in the clickhouse data directory. - * The relative path begins with the clickhouse data directory. - */ -class TableFunctionFile : public ITableFunction -{ -public: - static constexpr auto name = "file"; - std::string getName() const override { return name; } - -private: - StoragePtr executeImpl(const ASTPtr & ast_function, const Context & context) const override; -}; - - -} // namespace DB diff --git a/dbms/src/TableFunctions/registerTableFunctions.cpp b/dbms/src/TableFunctions/registerTableFunctions.cpp index 2eac0ce0548..8449077cc96 100644 --- a/dbms/src/TableFunctions/registerTableFunctions.cpp +++ b/dbms/src/TableFunctions/registerTableFunctions.cpp @@ -21,16 +21,13 @@ namespace DB { void registerTableFunctionMerge(TableFunctionFactory & factory); void registerTableFunctionNumbers(TableFunctionFactory & factory); -void registerTableFunctionCatBoostPool(TableFunctionFactory & factory); -void registerTableFunctionFile(TableFunctionFactory & factory); + void registerTableFunctions() { auto & factory = TableFunctionFactory::instance(); registerTableFunctionMerge(factory); registerTableFunctionNumbers(factory); - registerTableFunctionCatBoostPool(factory); - registerTableFunctionFile(factory); } } // namespace DB diff --git a/dbms/src/TestUtils/ColumnsToTiPBExpr.cpp b/dbms/src/TestUtils/ColumnsToTiPBExpr.cpp index dcf727614b1..ea19ff08dd3 100644 --- a/dbms/src/TestUtils/ColumnsToTiPBExpr.cpp +++ b/dbms/src/TestUtils/ColumnsToTiPBExpr.cpp @@ -36,6 +36,7 @@ void columnToTiPBExpr(tipb::Expr * expr, const ColumnWithTypeAndName column, siz if (column.column->isColumnNullable()) { auto [col, null_map] = removeNullable(column.column.get()); + (void)null_map; is_const = col->isColumnConst(); } } @@ -97,6 +98,7 @@ void columnsToTiPBExprForTiDBCast( if (type_column.column->isColumnNullable()) { auto [col, null_map] = removeNullable(type_column.column.get()); + (void)null_map; is_const = col->isColumnConst(); } } diff --git a/dbms/src/TestUtils/ExecutorTestUtils.cpp b/dbms/src/TestUtils/ExecutorTestUtils.cpp index 91c1430f7a0..763c74b45b6 100644 --- a/dbms/src/TestUtils/ExecutorTestUtils.cpp +++ b/dbms/src/TestUtils/ExecutorTestUtils.cpp @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
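// ----------------------------------------------------------------------------
// Editor's aside: the `(void)null_map;` lines added above use the usual idiom
// for discarding one member of a structured binding, which, unlike std::tie,
// cannot bind to std::ignore. A stand-alone model; removeNullableMock is a
// made-up stand-in for the real removeNullable helper:
#include <utility>

std::pair<int, bool> removeNullableMock() { return {42, true}; }

int main()
{
    auto [col, null_map] = removeNullableMock();
    (void)null_map; // only `col` is needed; mark `null_map` as intentionally unused
    return col == 42 ? 0 : 1;
}
// ----------------------------------------------------------------------------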
+#include #include #include #include #include #include + +#include + namespace DB::tests { DAGContext & ExecutorTest::getDAGContext() @@ -34,15 +38,20 @@ void ExecutorTest::initializeContext() void ExecutorTest::SetUpTestCase() { - try - { - DB::registerFunctions(); - DB::registerAggregateFunctions(); - } - catch (DB::Exception &) - { - // Maybe another test has already registered, ignore exception here. - } + auto register_func = [](std::function func) { + try + { + func(); + } + catch (DB::Exception &) + { + // Maybe another test has already registered, ignore exception here. + } + }; + + register_func(DB::registerFunctions); + register_func(DB::registerAggregateFunctions); + register_func(DB::registerWindowFunctions); } void ExecutorTest::initializeClientInfo() @@ -96,33 +105,43 @@ Block mergeBlocks(Blocks blocks) } } // namespace -void ExecutorTest::readAndAssertBlock(BlockInputStreamPtr stream, const ColumnsWithTypeAndName & expect_columns) +DB::ColumnsWithTypeAndName readBlock(BlockInputStreamPtr stream) { Blocks actual_blocks; - Block except_block(expect_columns); stream->readPrefix(); while (auto block = stream->read()) { actual_blocks.push_back(block); } stream->readSuffix(); - Block actual_block = mergeBlocks(actual_blocks); - ASSERT_BLOCK_EQ(except_block, actual_block); + return mergeBlocks(actual_blocks).getColumnsWithTypeAndName(); +} + +void ExecutorTest::enablePlanner(bool is_enable) +{ + context.context.setSetting("enable_planner", is_enable ? "true" : "false"); } -void ExecutorTest::executeStreams(const std::shared_ptr & request, std::unordered_map & source_columns_map, const ColumnsWithTypeAndName & expect_columns, size_t concurrency) +DB::ColumnsWithTypeAndName ExecutorTest::executeStreams(const std::shared_ptr & request, std::unordered_map & source_columns_map, size_t concurrency) { DAGContext dag_context(*request, "executor_test", concurrency); dag_context.setColumnsForTest(source_columns_map); context.context.setDAGContext(&dag_context); // Currently, don't care about regions information in tests. 
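// ----------------------------------------------------------------------------
// Editor's aside: a hedged usage sketch of the executeStreams refactor above.
// The function now returns the materialized columns instead of asserting
// internally, so the caller picks the comparison macro. ASSERT_COLUMNS_EQ_UR
// and createColumns are introduced later in this patch; the table name and
// data below are made up for illustration.
TEST_F(ExecutorTest, ProjectionReadsBack)
try
{
    auto request = context.scan("test_db", "test_table")
                       .project({"s1"})
                       .build(context);
    ASSERT_COLUMNS_EQ_UR(
        createColumns({toNullableVec<String>("s1", {"banana", "apple"})}),
        executeStreams(request));
}
CATCH
// ----------------------------------------------------------------------------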
DAGQuerySource dag(context.context); - readAndAssertBlock(executeQuery(dag, context.context, false, QueryProcessingStage::Complete).in, expect_columns); + return readBlock(executeQuery(dag, context.context, false, QueryProcessingStage::Complete).in); +} + +DB::ColumnsWithTypeAndName ExecutorTest::executeStreams(const std::shared_ptr & request, size_t concurrency) +{ + return executeStreams(request, context.executorIdColumnsMap(), concurrency); } -void ExecutorTest::executeStreams(const std::shared_ptr & request, const ColumnsWithTypeAndName & expect_columns, size_t concurrency) +DB::ColumnsWithTypeAndName ExecutorTest::executeStreamsWithSingleSource(const std::shared_ptr & request, const ColumnsWithTypeAndName & source_columns, SourceType type, size_t concurrency) { - executeStreams(request, context.executorIdColumnsMap(), expect_columns, concurrency); + std::unordered_map source_columns_map; + source_columns_map[getSourceName(type)] = source_columns; + return executeStreams(request, source_columns_map, concurrency); } void ExecutorTest::dagRequestEqual(const String & expected_string, const std::shared_ptr & actual) diff --git a/dbms/src/TestUtils/ExecutorTestUtils.h b/dbms/src/TestUtils/ExecutorTestUtils.h index 56a07085e50..54fed31f88d 100644 --- a/dbms/src/TestUtils/ExecutorTestUtils.h +++ b/dbms/src/TestUtils/ExecutorTestUtils.h @@ -15,18 +15,21 @@ #pragma once #include -#include -#include #include #include #include -#include -#include #include #include +#include + namespace DB::tests { void executeInterpreter(const std::shared_ptr & request, Context & context); + +::testing::AssertionResult check_columns_equality(const ColumnsWithTypeAndName & expected, const ColumnsWithTypeAndName & actual, bool _restrict); + +DB::ColumnsWithTypeAndName readBlock(BlockInputStreamPtr stream); + class ExecutorTest : public ::testing::Test { protected: @@ -48,45 +51,56 @@ class ExecutorTest : public ::testing::Test DAGContext & getDAGContext(); + /// for planner + void enablePlanner(bool is_enable); + template + void wrapForDisEnablePlanner(FF && ff) + { + enablePlanner(false); + ff(); + enablePlanner(true); + ff(); + } + static void dagRequestEqual(const String & expected_string, const std::shared_ptr & actual); void executeInterpreter(const String & expected_string, const std::shared_ptr & request, size_t concurrency); - void executeStreams( - const std::shared_ptr & request, - std::unordered_map & source_columns_map, - const ColumnsWithTypeAndName & expect_columns, - size_t concurrency = 1); - void executeStreams( - const std::shared_ptr & request, - const ColumnsWithTypeAndName & expect_columns, - size_t concurrency = 1); - - template - ColumnWithTypeAndName toNullableVec(const std::vector::FieldType>> & v) + enum SourceType { - return createColumn>(v); - } - - template - ColumnWithTypeAndName toVec(const std::vector::FieldType> & v) - { - return createColumn(v); - } + TableScan, + ExchangeReceiver + }; - template - ColumnWithTypeAndName toNullableVec(String name, const std::vector::FieldType>> & v) + // for single source query, the source executor name is ${type}_0 + static String getSourceName(SourceType type) { - return createColumn>(v, name); + switch (type) + { + case TableScan: + return "table_scan_0"; + case ExchangeReceiver: + return "exchange_receiver_0"; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unknown Executor Source type {}", + type); + } } - template - ColumnWithTypeAndName toVec(String name, const std::vector::FieldType> & v) - { - return createColumn(v, name); - } + 
ColumnsWithTypeAndName executeStreams( + const std::shared_ptr & request, + std::unordered_map & source_columns_map, + size_t concurrency = 1); + ColumnsWithTypeAndName executeStreams( + const std::shared_ptr & request, + size_t concurrency = 1); - static void readAndAssertBlock(BlockInputStreamPtr stream, const ColumnsWithTypeAndName & expect_columns); + ColumnsWithTypeAndName executeStreamsWithSingleSource( + const std::shared_ptr & request, + const ColumnsWithTypeAndName & source_columns, + SourceType type = TableScan, + size_t concurrency = 1); protected: MockDAGRequestContext context; @@ -95,4 +109,4 @@ class ExecutorTest : public ::testing::Test #define ASSERT_DAGREQUEST_EQAUL(str, request) dagRequestEqual((str), (request)); #define ASSERT_BLOCKINPUTSTREAM_EQAUL(str, request, concurrency) executeInterpreter((str), (request), (concurrency)) -} // namespace DB::tests \ No newline at end of file +} // namespace DB::tests diff --git a/dbms/src/TestUtils/FunctionTestUtils.cpp b/dbms/src/TestUtils/FunctionTestUtils.cpp index dae07f7123b..1c8b0242bfa 100644 --- a/dbms/src/TestUtils/FunctionTestUtils.cpp +++ b/dbms/src/TestUtils/FunctionTestUtils.cpp @@ -13,7 +13,9 @@ // limitations under the License. #include +#include #include +#include #include #include #include @@ -23,7 +25,10 @@ #include #include #include -#include + +#include +#include + namespace DB { @@ -103,21 +108,118 @@ ::testing::AssertionResult columnEqual( return columnEqual(expected.column, actual.column); } -void blockEqual( +::testing::AssertionResult blockEqual( const Block & expected, const Block & actual) { size_t columns = actual.columns(); + size_t expected_columns = expected.columns(); - ASSERT_TRUE(expected.columns() == columns); + ASSERT_EQUAL(expected_columns, columns, "Block size mismatch"); for (size_t i = 0; i < columns; ++i) { const auto & expected_col = expected.getByPosition(i); const auto & actual_col = actual.getByPosition(i); - ASSERT_TRUE(actual_col.type->getName() == expected_col.type->getName()); - ASSERT_COLUMN_EQ(expected_col.column, actual_col.column); + + auto cmp_res = columnEqual(expected_col, actual_col); + if (!cmp_res) + return cmp_res; + } + return ::testing::AssertionSuccess(); +} + +/// size of each column should be the same +std::multiset columnsToRowSet(const ColumnsWithTypeAndName & cols) +{ + if (cols.empty()) + return {}; + if (cols[0].column->empty()) + return {}; + + size_t cols_size = cols.size(); + std::vector rows{cols[0].column->size()}; + + for (auto & r : rows) + { + r.resize(cols_size, true); + } + + for (auto const & [col_id, col] : ext::enumerate(cols)) + { + for (size_t i = 0, size = col.column->size(); i < size; ++i) + { + new (rows[i].place(col_id)) Field((*col.column)[i]); + } + } + return {std::make_move_iterator(rows.begin()), std::make_move_iterator(rows.end())}; +} + +::testing::AssertionResult columnsEqual( + const ColumnsWithTypeAndName & expected, + const ColumnsWithTypeAndName & actual, + bool _restrict) +{ + if (_restrict) + return blockEqual(Block(expected), Block(actual)); + + auto expect_cols_size = expected.size(); + auto actual_cols_size = actual.size(); + + ASSERT_EQUAL(expect_cols_size, actual_cols_size, "Columns size mismatch"); + + for (size_t i = 0; i < expect_cols_size; ++i) + { + auto const & expect_col = expected[i]; + auto const & actual_col = actual[i]; + ASSERT_EQUAL(expect_col.column->getName(), actual_col.column->getName(), fmt::format("Column {} name mismatch", i)); + ASSERT_EQUAL(expect_col.column->size(), actual_col.column->size(), 
fmt::format("Column {} size mismatch", i)); + auto type_eq = dataTypeEqual(expected[i].type, actual[i].type); + if (!type_eq) + return type_eq; + } + + auto const expected_row_set = columnsToRowSet(expected); + auto const actual_row_set = columnsToRowSet(actual); + + if (expected_row_set != actual_row_set) + { + FmtBuffer buf; + + auto expect_it = expected_row_set.begin(); + auto actual_it = actual_row_set.begin(); + + buf.append("Columns row set mismatch\n").append("expected_row_set:\n"); + for (; expect_it != expected_row_set.end(); ++expect_it, ++actual_it) + { + buf.joinStr( + expect_it->begin(), + expect_it->end(), + [](const auto & v, FmtBuffer & fb) { fb.append(v.toString()); }, + " ") + .append("\n"); + if (*expect_it != *actual_it) + break; + } + + ++actual_it; + + buf.append("...\nactual_row_set:\n"); + for (auto it = actual_row_set.begin(); it != actual_it; ++it) + { + buf.joinStr( + it->begin(), + it->end(), + [](const auto & v, FmtBuffer & fb) { fb.append(v.toString()); }, + " ") + .append("\n"); + } + buf.append("...\n"); + + return testing::AssertionFailure() << buf.toString(); } + + return testing::AssertionSuccess(); } std::pair buildFunction( @@ -242,5 +344,96 @@ ColumnWithTypeAndName createOnlyNullColumn(size_t size, const String & name) return {std::move(col), data_type, name}; } +ColumnWithTypeAndName toDatetimeVec(String name, const std::vector & v, int fsp) +{ + std::vector::FieldType> vec; + vec.reserve(v.size()); + for (const auto & value_str : v) + { + Field value = parseMyDateTime(value_str, fsp); + vec.push_back(value.template safeGet()); + } + DataTypePtr data_type = std::make_shared(fsp); + return {makeColumn(data_type, vec), data_type, name, 0}; +} + +ColumnWithTypeAndName toNullableDatetimeVec(String name, const std::vector & v, int fsp) +{ + std::vector::FieldType>> vec; + vec.reserve(v.size()); + for (const auto & value_str : v) + { + if (!value_str.empty()) + { + Field value = parseMyDateTime(value_str, fsp); + vec.push_back(value.template safeGet()); + } + else + { + vec.push_back({}); + } + } + DataTypePtr data_type = makeNullable(std::make_shared(fsp)); + return {makeColumn>(data_type, vec), data_type, name, 0}; +} + +String getColumnsContent(const ColumnsWithTypeAndName & cols) +{ + if (cols.size() <= 0) + return ""; + return getColumnsContent(cols, 0, cols[0].column->size() - 1); +} + +String getColumnsContent(const ColumnsWithTypeAndName & cols, size_t begin, size_t end) +{ + const size_t col_num = cols.size(); + if (col_num <= 0) + return ""; + + const size_t col_size = cols[0].column->size(); + assert(begin <= end); + assert(col_size > end); + assert(col_size > begin); + + bool is_same = true; + + for (size_t i = 1; i < col_num; ++i) + { + if (cols[i].column->size() != col_size) + is_same = false; + } + + assert(is_same); /// Ensure the sizes of columns in cols are the same + + std::vector> col_content; + FmtBuffer fmt_buf; + for (size_t i = 0; i < col_num; ++i) + { + /// Push the column name + fmt_buf.append(fmt::format("{}: (", cols[i].name)); + for (size_t j = begin; j <= end; ++j) + col_content.push_back(std::make_pair(j, (*cols[i].column)[j].toString())); + + /// Add content + fmt_buf.joinStr( + col_content.begin(), + col_content.end(), + [](const auto & content, FmtBuffer & fmt_buf) { + fmt_buf.append(fmt::format("{}: {}", content.first, content.second)); + }, + ", "); + + fmt_buf.append(")\n"); + col_content.clear(); + } + + return fmt_buf.toString(); +} + +ColumnsWithTypeAndName createColumns(const ColumnsWithTypeAndName & cols) +{ + 
return cols; +} + } // namespace tests } // namespace DB diff --git a/dbms/src/TestUtils/FunctionTestUtils.h b/dbms/src/TestUtils/FunctionTestUtils.h index 7704c69a89f..8680d1886b1 100644 --- a/dbms/src/TestUtils/FunctionTestUtils.h +++ b/dbms/src/TestUtils/FunctionTestUtils.h @@ -514,6 +514,17 @@ ColumnWithTypeAndName createConstColumn( return createConstColumn(data_type_args, size, InferredFieldType(std::nullopt), name); } +String getColumnsContent(const ColumnsWithTypeAndName & cols); + +/// Prints only the rows in the range designated by begin and end; the range is inclusive: [begin, end] +String getColumnsContent(const ColumnsWithTypeAndName & cols, size_t begin, size_t end); + +// This wrapper function only serves to construct the columns input for function-like macros, +// since the preprocessor treats `{col1, col2, col3}` as three arguments instead of one. +// E.g. the preprocessor does not allow us to write `ASSERT_COLUMNS_EQ_R({col1, col2, col3}, actual_cols)`, +// but with this function we can write `ASSERT_COLUMNS_EQ_R(createColumns({col1, col2, col3}), actual_cols)` instead. +ColumnsWithTypeAndName createColumns(const ColumnsWithTypeAndName & cols); + ::testing::AssertionResult dataTypeEqual( const DataTypePtr & expected, const DataTypePtr & actual); @@ -527,10 +538,15 @@ ::testing::AssertionResult columnEqual( const ColumnWithTypeAndName & expected, const ColumnWithTypeAndName & actual); -void blockEqual( +::testing::AssertionResult blockEqual( const Block & expected, const Block & actual); +::testing::AssertionResult columnsEqual( + const ColumnsWithTypeAndName & expected, + const ColumnsWithTypeAndName & actual, + bool _restrict); + ColumnWithTypeAndName executeFunction( Context & context, const String & func_name, @@ -654,6 +670,33 @@ ColumnWithTypeAndName createNullableColumn( return createNullableColumn(data_type_args, vec, null_map, name, 0); } +template +ColumnWithTypeAndName toNullableVec(const std::vector::FieldType>> & v) +{ + return createColumn>(v); +} + +template +ColumnWithTypeAndName toVec(const std::vector::FieldType> & v) +{ + return createColumn(v); +} + +template +ColumnWithTypeAndName toNullableVec(String name, const std::vector::FieldType>> & v) +{ + return createColumn>(v, name); +} + +template +ColumnWithTypeAndName toVec(String name, const std::vector::FieldType> & v) +{ + return createColumn(v, name); +} + +ColumnWithTypeAndName toDatetimeVec(String name, const std::vector & v, int fsp); + +ColumnWithTypeAndName toNullableDatetimeVec(String name, const std::vector & v, int fsp); class FunctionTest : public ::testing::Test { protected: @@ -729,5 +772,10 @@ class FunctionTest : public ::testing::Test #define ASSERT_COLUMN_EQ(expected, actual) ASSERT_TRUE(DB::tests::columnEqual((expected), (actual))) #define ASSERT_BLOCK_EQ(expected, actual) DB::tests::blockEqual((expected), (actual)) + +/// Strictly check columns for equality: both the data set and the order of rows must match +#define ASSERT_COLUMNS_EQ_R(expected, actual) ASSERT_TRUE(DB::tests::columnsEqual((expected), (actual), true)) +/// Loosely check columns for equality: only the unordered data set must match +#define ASSERT_COLUMNS_EQ_UR(expected, actual) ASSERT_TRUE(DB::tests::columnsEqual((expected), (actual), false)) } // namespace tests } // namespace DB diff --git a/dbms/src/TestUtils/bench_dbms_main.cpp b/dbms/src/TestUtils/bench_dbms_main.cpp index 48bd02a71f7..092c45c35e2 100644 --- a/dbms/src/TestUtils/bench_dbms_main.cpp +++ b/dbms/src/TestUtils/bench_dbms_main.cpp @@ -20,6 +20,8 @@ int main(int argc, char *
argv[]) { benchmark::Initialize(&argc, argv); DB::tests::TiFlashTestEnv::setupLogger(); + // Each time TiFlashTestEnv::getContext() is called, some log will print, it's annoying. + Poco::Logger::root().setLevel("error"); DB::tests::TiFlashTestEnv::initializeGlobalContext(); if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; diff --git a/dbms/src/TestUtils/executorSerializer.cpp b/dbms/src/TestUtils/executorSerializer.cpp index b8d2b039bd2..a0ae4b11270 100644 --- a/dbms/src/TestUtils/executorSerializer.cpp +++ b/dbms/src/TestUtils/executorSerializer.cpp @@ -204,6 +204,66 @@ void serializeExchangeReceiver(const String & executor_id, const tipb::ExchangeR buf.append("}\n"); } +void serializeWindow(const String & executor_id, const tipb::Window & window [[maybe_unused]], FmtBuffer & buf) +{ + buf.fmtAppend("{} | partition_by: {{", executor_id); + buf.joinStr( + window.partition_by().begin(), + window.partition_by().end(), + [&](const auto & partition_by, FmtBuffer & fb) { + fb.append("("); + serializeExpression(partition_by.expr(), buf); + fb.fmtAppend(", desc: {})", partition_by.desc()); + }, + ", "); + buf.append("}}, order_by: {"); + buf.joinStr( + window.order_by().begin(), + window.order_by().end(), + [&](const auto & order_by, FmtBuffer & fb) { + fb.append("("); + serializeExpression(order_by.expr(), buf); + fb.fmtAppend(", desc: {})", order_by.desc()); + }, + ", "); + buf.append("}, func_desc: {"); + buf.joinStr( + window.func_desc().begin(), + window.func_desc().end(), + [&](const auto & func, FmtBuffer &) { + serializeExpression(func, buf); + }, + ", "); + if (window.has_frame()) + { + buf.append("}, frame: {"); + if (window.frame().has_start()) + { + buf.fmtAppend("start<{}, {}, {}>", window.frame().start().type(), window.frame().start().unbounded(), window.frame().start().offset()); + } + if (window.frame().has_end()) + { + buf.fmtAppend(", end<{}, {}, {}>", window.frame().end().type(), window.frame().end().unbounded(), window.frame().end().offset()); + } + } + buf.append("}\n"); +} + +void serializeSort(const String & executor_id, const tipb::Sort & sort [[maybe_unused]], FmtBuffer & buf) +{ + buf.fmtAppend("{} | isPartialSort: {}, partition_by: {{", executor_id, sort.ispartialsort()); + buf.joinStr( + sort.byitems().begin(), + sort.byitems().end(), + [&](const auto & by, FmtBuffer & fb) { + fb.append("("); + serializeExpression(by.expr(), buf); + fb.fmtAppend(", desc: {})", by.desc()); + }, + ", "); + buf.append("}\n"); +} + void ExecutorSerializer::serialize(const tipb::Executor & root_executor, size_t level) { auto append_str = [&level, this](const tipb::Executor & executor) { @@ -248,9 +308,11 @@ void ExecutorSerializer::serialize(const tipb::Executor & root_executor, size_t serializeExchangeSender(executor.executor_id(), executor.exchange_sender(), buf); break; case tipb::ExecType::TypeSort: - throw TiFlashException("Sort executor is not supported", Errors::Coprocessor::Unimplemented); // todo support sort executor. + serializeSort(executor.executor_id(), executor.sort(), buf); + break; case tipb::ExecType::TypeWindow: - throw TiFlashException("Window executor is not supported", Errors::Coprocessor::Unimplemented); // todo support window executor. 
+ serializeWindow(executor.executor_id(), executor.window(), buf); + break; default: throw TiFlashException("Should not reach here", Errors::Coprocessor::Internal); } diff --git a/dbms/src/TestUtils/mockExecutor.cpp b/dbms/src/TestUtils/mockExecutor.cpp index af939002cff..30d05786c9a 100644 --- a/dbms/src/TestUtils/mockExecutor.cpp +++ b/dbms/src/TestUtils/mockExecutor.cpp @@ -23,8 +23,6 @@ #include #include -#include - namespace DB::tests { ASTPtr buildColumn(const String & column_name) @@ -37,7 +35,7 @@ ASTPtr buildLiteral(const Field & field) return std::make_shared(field); } -ASTPtr buildOrderByItemList(MockOrderByItems order_by_items) +ASTPtr buildOrderByItemVec(MockOrderByItemVec order_by_items) { std::vector vec(order_by_items.size()); size_t i = 0; @@ -54,6 +52,15 @@ ASTPtr buildOrderByItemList(MockOrderByItems order_by_items) return exp_list; } +MockWindowFrame buildDefaultRowsFrame() +{ + MockWindowFrame frame; + frame.type = tipb::WindowFrameType::Rows; + frame.end = {tipb::WindowBoundType::CurrentRow, false, 0}; + frame.start = {tipb::WindowBoundType::CurrentRow, false, 0}; + return frame; +} + // a mock DAGRequest should prepare its time_zone, flags, encode_type and output_schema. void DAGRequestBuilder::initDAGRequest(tipb::DAGRequest & dag_request) { @@ -85,7 +92,7 @@ std::shared_ptr DAGRequestBuilder::build(MockDAGRequestContext return dag_request_ptr; } -DAGRequestBuilder & DAGRequestBuilder::mockTable(const String & db, const String & table, const MockColumnInfos & columns) +DAGRequestBuilder & DAGRequestBuilder::mockTable(const String & db, const String & table, const MockColumnInfoVec & columns) { assert(!columns.empty()); TableInfo table_info; @@ -96,6 +103,9 @@ DAGRequestBuilder & DAGRequestBuilder::mockTable(const String & db, const String TiDB::ColumnInfo ret; ret.tp = column.second; ret.name = column.first; + // TODO: find a way to assign decimal field's flen. 
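// Editor's note on the TODO just above: 65 is the maximum precision of a
// MySQL/TiDB DECIMAL, so the lines that follow use it as a safe upper-bound
// placeholder flen for mock decimal columns until the mock-table API lets
// tests set the precision explicitly.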
+ if (ret.tp == TiDB::TP::TypeNewDecimal) + ret.flen = 65; ret.id = i++; table_info.columns.push_back(std::move(ret)); } @@ -104,27 +114,17 @@ DAGRequestBuilder & DAGRequestBuilder::mockTable(const String & db, const String return *this; } -DAGRequestBuilder & DAGRequestBuilder::mockTable(const MockTableName & name, const MockColumnInfos & columns) +DAGRequestBuilder & DAGRequestBuilder::mockTable(const MockTableName & name, const MockColumnInfoVec & columns) { return mockTable(name.first, name.second, columns); } -DAGRequestBuilder & DAGRequestBuilder::mockTable(const MockTableName & name, const MockColumnInfoList & columns) +DAGRequestBuilder & DAGRequestBuilder::exchangeReceiver(const MockColumnInfoVec & columns, uint64_t fine_grained_shuffle_stream_count) { - return mockTable(name.first, name.second, columns); + return buildExchangeReceiver(columns, fine_grained_shuffle_stream_count); } -DAGRequestBuilder & DAGRequestBuilder::exchangeReceiver(const MockColumnInfos & columns) -{ - return buildExchangeReceiver(columns); -} - -DAGRequestBuilder & DAGRequestBuilder::exchangeReceiver(const MockColumnInfoList & columns) -{ - return buildExchangeReceiver(columns); -} - -DAGRequestBuilder & DAGRequestBuilder::buildExchangeReceiver(const MockColumnInfos & columns) +DAGRequestBuilder & DAGRequestBuilder::buildExchangeReceiver(const MockColumnInfoVec & columns, uint64_t fine_grained_shuffle_stream_count) { DAGSchema schema; for (const auto & column : columns) @@ -135,7 +135,7 @@ DAGRequestBuilder & DAGRequestBuilder::buildExchangeReceiver(const MockColumnInf schema.push_back({column.first, info}); } - root = compileExchangeReceiver(getExecutorIndex(), schema); + root = compileExchangeReceiver(getExecutorIndex(), schema, fine_grained_shuffle_stream_count); return *this; } @@ -170,33 +170,23 @@ DAGRequestBuilder & DAGRequestBuilder::topN(ASTPtr order_exprs, ASTPtr limit_exp DAGRequestBuilder & DAGRequestBuilder::topN(const String & col_name, bool desc, int limit) { assert(root); - root = compileTopN(root, getExecutorIndex(), buildOrderByItemList({{col_name, desc}}), buildLiteral(Field(static_cast(limit)))); + root = compileTopN(root, getExecutorIndex(), buildOrderByItemVec({{col_name, desc}}), buildLiteral(Field(static_cast(limit)))); return *this; } -DAGRequestBuilder & DAGRequestBuilder::topN(MockOrderByItems order_by_items, int limit) +DAGRequestBuilder & DAGRequestBuilder::topN(MockOrderByItemVec order_by_items, int limit) { return topN(order_by_items, buildLiteral(Field(static_cast(limit)))); } -DAGRequestBuilder & DAGRequestBuilder::topN(MockOrderByItems order_by_items, ASTPtr limit_expr) +DAGRequestBuilder & DAGRequestBuilder::topN(MockOrderByItemVec order_by_items, ASTPtr limit_expr) { assert(root); - root = compileTopN(root, getExecutorIndex(), buildOrderByItemList(order_by_items), limit_expr); + root = compileTopN(root, getExecutorIndex(), buildOrderByItemVec(order_by_items), limit_expr); return *this; } -DAGRequestBuilder & DAGRequestBuilder::project(const String & col_name) -{ - assert(root); - auto exp_list = std::make_shared(); - exp_list->children.push_back(buildColumn(col_name)); - - root = compileProject(root, getExecutorIndex(), exp_list); - return *this; -} - -DAGRequestBuilder & DAGRequestBuilder::project(MockAsts exprs) +DAGRequestBuilder & DAGRequestBuilder::project(MockAstVec exprs) { assert(root); auto exp_list = std::make_shared(); @@ -208,7 +198,7 @@ DAGRequestBuilder & DAGRequestBuilder::project(MockAsts exprs) return *this; } -DAGRequestBuilder & 
DAGRequestBuilder::project(MockColumnNames col_names) +DAGRequestBuilder & DAGRequestBuilder::project(MockColumnNameVec col_names) { assert(root); auto exp_list = std::make_shared(); @@ -227,12 +217,12 @@ DAGRequestBuilder & DAGRequestBuilder::exchangeSender(tipb::ExchangeType exchang return *this; } -DAGRequestBuilder & DAGRequestBuilder::join(const DAGRequestBuilder & right, MockAsts exprs) +DAGRequestBuilder & DAGRequestBuilder::join(const DAGRequestBuilder & right, MockAstVec exprs) { return join(right, exprs, ASTTableJoin::Kind::Inner); } -DAGRequestBuilder & DAGRequestBuilder::join(const DAGRequestBuilder & right, MockAsts exprs, ASTTableJoin::Kind kind) +DAGRequestBuilder & DAGRequestBuilder::join(const DAGRequestBuilder & right, MockAstVec exprs, ASTTableJoin::Kind kind) { assert(root); assert(right.root); @@ -258,7 +248,7 @@ DAGRequestBuilder & DAGRequestBuilder::aggregation(ASTPtr agg_func, ASTPtr group return buildAggregation(agg_funcs, group_by_exprs); } -DAGRequestBuilder & DAGRequestBuilder::aggregation(MockAsts agg_funcs, MockAsts group_by_exprs) +DAGRequestBuilder & DAGRequestBuilder::aggregation(MockAstVec agg_funcs, MockAstVec group_by_exprs) { auto agg_func_list = std::make_shared(); auto group_by_expr_list = std::make_shared(); @@ -276,41 +266,61 @@ DAGRequestBuilder & DAGRequestBuilder::buildAggregation(ASTPtr agg_funcs, ASTPtr return *this; } -void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfoList & columnInfos) +DAGRequestBuilder & DAGRequestBuilder::window(ASTPtr window_func, MockOrderByItem order_by, MockPartitionByItem partition_by, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count) { - std::vector v_column_info(columnInfos.size()); - size_t i = 0; - for (const auto & info : columnInfos) - { - v_column_info[i++] = std::move(info); - } - mock_tables[name.first + "." + name.second] = v_column_info; + assert(root); + auto window_func_list = std::make_shared(); + window_func_list->children.push_back(window_func); + root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemVec({partition_by}), buildOrderByItemVec({order_by}), frame, fine_grained_shuffle_stream_count); + return *this; } -void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfos & columnInfos) +DAGRequestBuilder & DAGRequestBuilder::window(ASTPtr window_func, MockOrderByItemVec order_by_vec, MockPartitionByItemVec partition_by_vec, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count) { - mock_tables[db + "." + table] = columnInfos; + assert(root); + auto window_func_list = std::make_shared(); + window_func_list->children.push_back(window_func); + root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemVec(partition_by_vec), buildOrderByItemVec(order_by_vec), frame, fine_grained_shuffle_stream_count); + return *this; } -void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfos & columnInfos) +DAGRequestBuilder & DAGRequestBuilder::window(MockAstVec window_funcs, MockOrderByItemVec order_by_vec, MockPartitionByItemVec partition_by_vec, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count) { - mock_tables[name.first + "." 
+ name.second] = columnInfos; + assert(root); + auto window_func_list = std::make_shared(); + for (const auto & func : window_funcs) + window_func_list->children.push_back(func); + root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemVec(partition_by_vec), buildOrderByItemVec(order_by_vec), frame, fine_grained_shuffle_stream_count); + return *this; } -void MockDAGRequestContext::addExchangeRelationSchema(String name, const MockColumnInfos & columnInfos) +DAGRequestBuilder & DAGRequestBuilder::sort(MockOrderByItem order_by, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count) { - exchange_schemas[name] = columnInfos; + assert(root); + root = compileSort(root, getExecutorIndex(), buildOrderByItemVec({order_by}), is_partial_sort, fine_grained_shuffle_stream_count); + return *this; } -void MockDAGRequestContext::addExchangeRelationSchema(String name, const MockColumnInfoList & columnInfos) +DAGRequestBuilder & DAGRequestBuilder::sort(MockOrderByItemVec order_by_vec, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count) { - std::vector v_column_info(columnInfos.size()); - size_t i = 0; - for (const auto & info : columnInfos) - { - v_column_info[i++] = std::move(info); - } - exchange_schemas[name] = v_column_info; + assert(root); + root = compileSort(root, getExecutorIndex(), buildOrderByItemVec(order_by_vec), is_partial_sort, fine_grained_shuffle_stream_count); + return *this; +} + +void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfoVec & columnInfos) +{ + mock_tables[db + "." + table] = columnInfos; +} + +void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfoVec & columnInfos) +{ + mock_tables[name.first + "." + name.second] = columnInfos; +} + +void MockDAGRequestContext::addExchangeRelationSchema(String name, const MockColumnInfoVec & columnInfos) +{ + exchange_schemas[name] = columnInfos; } void MockDAGRequestContext::addMockTableColumnData(const String & db, const String & table, ColumnsWithTypeAndName columns) @@ -328,37 +338,19 @@ void MockDAGRequestContext::addExchangeReceiverColumnData(const String & name, C mock_exchange_columns[name] = columns; } -void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfoList & columnInfos, ColumnsWithTypeAndName columns) -{ - addMockTable(db, table, columnInfos); - addMockTableColumnData(db, table, columns); -} - -void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfos & columnInfos, ColumnsWithTypeAndName columns) +void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfoVec & columnInfos, ColumnsWithTypeAndName columns) { addMockTable(db, table, columnInfos); addMockTableColumnData(db, table, columns); } -void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfoList & columnInfos, ColumnsWithTypeAndName columns) -{ - addMockTable(name, columnInfos); - addMockTableColumnData(name, columns); -} - -void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfos & columnInfos, ColumnsWithTypeAndName columns) +void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfoVec & columnInfos, ColumnsWithTypeAndName columns) { addMockTable(name, columnInfos); addMockTableColumnData(name, columns); } -void MockDAGRequestContext::addExchangeReceiver(const String & name, MockColumnInfos columnInfos, 
ColumnsWithTypeAndName columns) -{ - addExchangeRelationSchema(name, columnInfos); - addExchangeReceiverColumnData(name, columns); -} - -void MockDAGRequestContext::addExchangeReceiver(const String & name, MockColumnInfoList columnInfos, ColumnsWithTypeAndName columns) +void MockDAGRequestContext::addExchangeReceiver(const String & name, MockColumnInfoVec columnInfos, ColumnsWithTypeAndName columns) { addExchangeRelationSchema(name, columnInfos); addExchangeReceiverColumnData(name, columns); @@ -376,9 +368,9 @@ DAGRequestBuilder MockDAGRequestContext::scan(String db_name, String table_name) return builder; } -DAGRequestBuilder MockDAGRequestContext::receive(String exchange_name) +DAGRequestBuilder MockDAGRequestContext::receive(String exchange_name, uint64_t fine_grained_shuffle_stream_count) { - auto builder = DAGRequestBuilder(index).exchangeReceiver(exchange_schemas[exchange_name]); + auto builder = DAGRequestBuilder(index).exchangeReceiver(exchange_schemas[exchange_name], fine_grained_shuffle_stream_count); receiver_source_task_ids_map[builder.getRoot()->name] = {}; // If don't have related columns, user must pass input columns as argument of executeStreams in order to run Executors Tests. // If user don't want to test executors, it will be safe to run Interpreter Tests. @@ -388,4 +380,4 @@ DAGRequestBuilder MockDAGRequestContext::receive(String exchange_name) } return builder; } -} // namespace DB::tests \ No newline at end of file +} // namespace DB::tests diff --git a/dbms/src/TestUtils/mockExecutor.h b/dbms/src/TestUtils/mockExecutor.h index 95551cdfc9e..8b5a6d300ff 100644 --- a/dbms/src/TestUtils/mockExecutor.h +++ b/dbms/src/TestUtils/mockExecutor.h @@ -20,19 +20,18 @@ #include #include -#include -#include - namespace DB::tests { using MockColumnInfo = std::pair; -using MockColumnInfos = std::vector; -using MockColumnInfoList = std::initializer_list; +using MockColumnInfoVec = std::vector; using MockTableName = std::pair; using MockOrderByItem = std::pair; -using MockOrderByItems = std::initializer_list; -using MockColumnNames = std::initializer_list; -using MockAsts = std::initializer_list; +using MockOrderByItemVec = std::vector; +using MockPartitionByItem = std::pair; +using MockPartitionByItemVec = std::vector; +using MockColumnNameVec = std::vector; +using MockAstVec = std::vector; +using MockWindowFrame = mock::MockWindowFrame; class MockDAGRequestContext; @@ -64,12 +63,10 @@ class DAGRequestBuilder std::shared_ptr build(MockDAGRequestContext & mock_context); - DAGRequestBuilder & mockTable(const String & db, const String & table, const MockColumnInfos & columns); - DAGRequestBuilder & mockTable(const MockTableName & name, const MockColumnInfos & columns); - DAGRequestBuilder & mockTable(const MockTableName & name, const MockColumnInfoList & columns); + DAGRequestBuilder & mockTable(const String & db, const String & table, const MockColumnInfoVec & columns); + DAGRequestBuilder & mockTable(const MockTableName & name, const MockColumnInfoVec & columns); - DAGRequestBuilder & exchangeReceiver(const MockColumnInfos & columns); - DAGRequestBuilder & exchangeReceiver(const MockColumnInfoList & columns); + DAGRequestBuilder & exchangeReceiver(const MockColumnInfoVec & columns, uint64_t fine_grained_shuffle_stream_count = 0); DAGRequestBuilder & filter(ASTPtr filter_expr); @@ -78,28 +75,34 @@ class DAGRequestBuilder DAGRequestBuilder & topN(ASTPtr order_exprs, ASTPtr limit_expr); DAGRequestBuilder & topN(const String & col_name, bool desc, int limit); - DAGRequestBuilder & 
topN(MockOrderByItems order_by_items, int limit); - DAGRequestBuilder & topN(MockOrderByItems order_by_items, ASTPtr limit_expr); + DAGRequestBuilder & topN(MockOrderByItemVec order_by_items, int limit); + DAGRequestBuilder & topN(MockOrderByItemVec order_by_items, ASTPtr limit_expr); - DAGRequestBuilder & project(const String & col_name); - DAGRequestBuilder & project(MockAsts expr); - DAGRequestBuilder & project(MockColumnNames col_names); + DAGRequestBuilder & project(MockAstVec exprs); + DAGRequestBuilder & project(MockColumnNameVec col_names); DAGRequestBuilder & exchangeSender(tipb::ExchangeType exchange_type); - // Currentlt only support inner join, left join and right join. + // Currently only support inner join, left join and right join. // TODO support more types of join. - DAGRequestBuilder & join(const DAGRequestBuilder & right, MockAsts exprs); - DAGRequestBuilder & join(const DAGRequestBuilder & right, MockAsts exprs, ASTTableJoin::Kind kind); + DAGRequestBuilder & join(const DAGRequestBuilder & right, MockAstVec exprs); + DAGRequestBuilder & join(const DAGRequestBuilder & right, MockAstVec exprs, ASTTableJoin::Kind kind); // aggregation DAGRequestBuilder & aggregation(ASTPtr agg_func, ASTPtr group_by_expr); - DAGRequestBuilder & aggregation(MockAsts agg_funcs, MockAsts group_by_exprs); + DAGRequestBuilder & aggregation(MockAstVec agg_funcs, MockAstVec group_by_exprs); + + // window + DAGRequestBuilder & window(ASTPtr window_func, MockOrderByItem order_by, MockPartitionByItem partition_by, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count = 0); + DAGRequestBuilder & window(MockAstVec window_funcs, MockOrderByItemVec order_by_vec, MockPartitionByItemVec partition_by_vec, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count = 0); + DAGRequestBuilder & window(ASTPtr window_func, MockOrderByItemVec order_by_vec, MockPartitionByItemVec partition_by_vec, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count = 0); + DAGRequestBuilder & sort(MockOrderByItem order_by, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count = 0); + DAGRequestBuilder & sort(MockOrderByItemVec order_by_vec, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count = 0); private: void initDAGRequest(tipb::DAGRequest & dag_request); DAGRequestBuilder & buildAggregation(ASTPtr agg_funcs, ASTPtr group_by_exprs); - DAGRequestBuilder & buildExchangeReceiver(const MockColumnInfos & columns); + DAGRequestBuilder & buildExchangeReceiver(const MockColumnInfoVec & columns, uint64_t fine_grained_shuffle_stream_count = 0); ExecutorPtr root; DAGProperties properties; @@ -123,30 +126,25 @@ class MockDAGRequestContext return DAGRequestBuilder(index); } - void addMockTable(const MockTableName & name, const MockColumnInfoList & columnInfos); - void addMockTable(const String & db, const String & table, const MockColumnInfos & columnInfos); - void addMockTable(const MockTableName & name, const MockColumnInfos & columnInfos); - void addExchangeRelationSchema(String name, const MockColumnInfos & columnInfos); - void addExchangeRelationSchema(String name, const MockColumnInfoList & columnInfos); + void addMockTable(const String & db, const String & table, const MockColumnInfoVec & columnInfos); + void addMockTable(const MockTableName & name, const MockColumnInfoVec & columnInfos); + void addExchangeRelationSchema(String name, const MockColumnInfoVec & columnInfos); void addMockTableColumnData(const String & db, const String & table, ColumnsWithTypeAndName columns); - 
void addMockTable(const String & db, const String & table, const MockColumnInfoList & columnInfos, ColumnsWithTypeAndName columns); - void addMockTable(const String & db, const String & table, const MockColumnInfos & columnInfos, ColumnsWithTypeAndName columns); - void addMockTable(const MockTableName & name, const MockColumnInfoList & columnInfos, ColumnsWithTypeAndName columns); - void addMockTable(const MockTableName & name, const MockColumnInfos & columnInfos, ColumnsWithTypeAndName columns); + void addMockTable(const String & db, const String & table, const MockColumnInfoVec & columnInfos, ColumnsWithTypeAndName columns); + void addMockTable(const MockTableName & name, const MockColumnInfoVec & columnInfos, ColumnsWithTypeAndName columns); void addMockTableColumnData(const MockTableName & name, ColumnsWithTypeAndName columns); void addExchangeReceiverColumnData(const String & name, ColumnsWithTypeAndName columns); - void addExchangeReceiver(const String & name, MockColumnInfos columnInfos, ColumnsWithTypeAndName columns); - void addExchangeReceiver(const String & name, MockColumnInfoList columnInfos, ColumnsWithTypeAndName columns); + void addExchangeReceiver(const String & name, MockColumnInfoVec columnInfos, ColumnsWithTypeAndName columns); std::unordered_map & executorIdColumnsMap() { return executor_id_columns_map; } DAGRequestBuilder scan(String db_name, String table_name); - DAGRequestBuilder receive(String exchange_name); + DAGRequestBuilder receive(String exchange_name, uint64_t fine_grained_shuffle_stream_count = 0); private: size_t index; - std::unordered_map mock_tables; - std::unordered_map exchange_schemas; + std::unordered_map mock_tables; + std::unordered_map exchange_schemas; std::unordered_map mock_table_columns; std::unordered_map mock_exchange_columns; std::unordered_map executor_id_columns_map; /// @@ -161,8 +159,10 @@ class MockDAGRequestContext ASTPtr buildColumn(const String & column_name); ASTPtr buildLiteral(const Field & field); -ASTPtr buildFunction(MockAsts exprs, const String & name); -ASTPtr buildOrderByItemList(MockOrderByItems order_by_items); +ASTPtr buildFunction(MockAstVec exprs, const String & name); +ASTPtr buildOrderByItemVec(MockOrderByItemVec order_by_items); + +MockWindowFrame buildDefaultRowsFrame(); #define col(name) buildColumn((name)) #define lit(field) buildLiteral((field)) @@ -173,7 +173,12 @@ ASTPtr buildOrderByItemList(MockOrderByItems order_by_items); #define gt(expr1, expr2) makeASTFunction("greater", (expr1), (expr2)) #define And(expr1, expr2) makeASTFunction("and", (expr1), (expr2)) #define Or(expr1, expr2) makeASTFunction("or", (expr1), (expr2)) -#define NOT(expr) makeASTFunction("not", (expr1), (expr2)) -#define Max(expr) makeASTFunction("max", expr) - -} // namespace DB::tests \ No newline at end of file +#define NOT(expr) makeASTFunction("not", (expr)) +#define Max(expr) makeASTFunction("max", (expr)) +#define Sum(expr) makeASTFunction("sum", (expr)) +/// Window functions +#define RowNumber() makeASTFunction("RowNumber") +#define Rank() makeASTFunction("Rank") +#define DenseRank() makeASTFunction("DenseRank") + +} // namespace DB::tests diff --git a/dbms/src/TestUtils/tests/gtest_mock_executors.cpp b/dbms/src/TestUtils/tests/gtest_mock_executors.cpp index 214148fe47f..72f0bb505d1 100644 --- a/dbms/src/TestUtils/tests/gtest_mock_executors.cpp +++ b/dbms/src/TestUtils/tests/gtest_mock_executors.cpp @@ -76,7 +76,7 @@ TEST_F(MockDAGRequestTest, Projection) try { auto request = context.scan("test_db", "test_table") - .project("s1") + 
.project({"s1"}) .build(context); { String expected = "project_1 | {<0, String>}\n" @@ -252,5 +252,17 @@ try } CATCH +TEST_F(MockDAGRequestTest, MockWindow) +try +{ + auto request = context.scan("test_db", "test_table").sort({"s1", false}, true).window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()).build(context); + { + String expected = "window_2 | partition_by: {(<1, String>, desc: false)}}, order_by: {(<0, String>, desc: true)}, func_desc: {row_number()}, frame: {start<2, false, 0>, end<2, false, 0>}\n" + " sort_1 | isPartialSort: true, partition_by: {(<0, String>, desc: false)}\n" + " table_scan_0 | {<0, String>, <1, String>}\n"; + ASSERT_DAGREQUEST_EQAUL(expected, request); + } +} +CATCH } // namespace tests } // namespace DB \ No newline at end of file diff --git a/dbms/src/TestUtils/tests/gtest_print_columns.cpp b/dbms/src/TestUtils/tests/gtest_print_columns.cpp new file mode 100644 index 00000000000..50631fc4f4a --- /dev/null +++ b/dbms/src/TestUtils/tests/gtest_print_columns.cpp @@ -0,0 +1,57 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ + +class PrintColumnsTest : public DB::tests::ExecutorTest +{ +public: + using ColStringType = std::optional::FieldType>; + using ColInt32Type = std::optional::FieldType>; + using ColumnWithString = std::vector; + using ColumnWithInt32 = std::vector; + + void initializeContext() override + { + test_cols.push_back(toNullableVec("col1", ColumnWithInt32{36, 34, 32, 27, {}, {}})); + test_cols.push_back(toNullableVec("col2", ColumnWithString{"female", "male", "male", "female", "male", "female"})); + col_len = test_cols[0].column->size(); + } + + ColumnsWithTypeAndName test_cols; + size_t col_len; + const String result1{"col1: (0: Int64_36, 1: Int64_34, 2: Int64_32, 3: Int64_27, 4: NULL, 5: NULL)\ncol2: (0: 'female', 1: 'male', 2: 'male', 3: 'female', 4: 'male', 5: 'female')\n"}; + const String result2{"col1: (0: Int64_36, 1: Int64_34, 2: Int64_32, 3: Int64_27, 4: NULL, 5: NULL)\ncol2: (0: 'female', 1: 'male', 2: 'male', 3: 'female', 4: 'male', 5: 'female')\n"}; + const String result3{"col1: (0: Int64_36)\ncol2: (0: 'female')\n"}; + const String result4{"col1: (1: Int64_34, 2: Int64_32, 3: Int64_27, 4: NULL)\ncol2: (1: 'male', 2: 'male', 3: 'female', 4: 'male')\n"}; +}; + +TEST_F(PrintColumnsTest, SimpleTest) +try +{ + EXPECT_EQ(getColumnsContent(test_cols), result1); + EXPECT_EQ(getColumnsContent(test_cols, 0, col_len - 1), result2); + EXPECT_EQ(getColumnsContent(test_cols, 0, 0), result3); + EXPECT_EQ(getColumnsContent(test_cols, 1, col_len - 2), result4); +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/TiDB/Schema/SchemaBuilder-internal.h b/dbms/src/TiDB/Schema/SchemaBuilder-internal.h index a331205ce8c..94edcbea204 100644 --- a/dbms/src/TiDB/Schema/SchemaBuilder-internal.h +++ b/dbms/src/TiDB/Schema/SchemaBuilder-internal.h @@ -35,7 +35,7 @@ struct TableInfo; } namespace DB { -std::tuple 
parseColumnsFromTableInfo(const TiDB::TableInfo & table_info, Poco::Logger * log); +std::tuple parseColumnsFromTableInfo(const TiDB::TableInfo & table_info); constexpr char tmpNamePrefix[] = "_tiflash_tmp_"; diff --git a/dbms/src/TiDB/Schema/SchemaBuilder.cpp b/dbms/src/TiDB/Schema/SchemaBuilder.cpp index 99e540e6c95..6e4ad10e344 100644 --- a/dbms/src/TiDB/Schema/SchemaBuilder.cpp +++ b/dbms/src/TiDB/Schema/SchemaBuilder.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -320,7 +321,7 @@ inline SchemaChanges detectSchemaChanges( } template -void SchemaBuilder::applyAlterPhysicalTable(DBInfoPtr db_info, TableInfoPtr table_info, ManageableStoragePtr storage) +void SchemaBuilder::applyAlterPhysicalTable(const DBInfoPtr & db_info, const TableInfoPtr & table_info, const ManageableStoragePtr & storage) { LOG_FMT_INFO(log, "Altering table {}", name_mapper.debugCanonicalName(*db_info, *table_info)); @@ -370,7 +371,15 @@ void SchemaBuilder::applyAlterPhysicalTable(DBInfoPtr db_inf const auto & schema_change = schema_changes[i]; /// Update column infos by applying schema change in this step. schema_change.second(orig_table_info); - /// Update schema version aggressively for the sake of correctness. + /// Update schema version aggressively for the sake of correctness (on the read path). + /// On reads, we compare table_info.schema_version (storage_version) and TiDBSchemaSyncer.cur_version (global_version) with query_version to decide whether we can serve the read under this query_version or need to sync the schema first. + /// A query is only served when storage_version <= query_version <= global_version (for details, see the comments in DAGStorageInterpreter::getAndLockStorages). + /// When multiple diffs are applied here, global_version is only bumped after all of them have been applied. + /// So while schema changes are in flight, global_version may lag behind the actual "global_version" of the local schema. + /// If we did not also update storage_version ahead of time, applying multiple diffs could leave us with: storage_version <= global_version < actual "global_version". + /// A query arriving with query_version == global_version would then give: storage_version <= global_version == query_version < actual "global_version". + /// Because storage_version <= query_version <= global_version holds, the query would be served, yet query_version < actual "global_version" means a newer schema would serve an older query, which may cause inconsistency. + /// So we update storage_version aggressively to prevent this scenario.
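// ----------------------------------------------------------------------------
// Editor's aside: the serving criterion described in the comment above, as a
// stand-alone toy predicate (illustration only, not TiFlash code):
//
//     bool canServe(Int64 storage_ver, Int64 query_ver, Int64 global_ver)
//     {
//         return storage_ver <= query_ver && query_ver <= global_ver;
//     }
//
// Bumping storage_version to the target version first makes
// canServe(target_version, stale_query_version, lagging_global_version) return
// false, so a stale query triggers a schema sync instead of being served with
// a newer schema than it expects.
// ----------------------------------------------------------------------------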
orig_table_info.schema_version = target_version; auto alter_lock = storage->lockForAlter(getThreadName()); storage->alterFromTiDB( @@ -386,7 +395,7 @@ void SchemaBuilder::applyAlterPhysicalTable(DBInfoPtr db_inf } template -void SchemaBuilder::applyAlterTable(DBInfoPtr db_info, TableID table_id) +void SchemaBuilder::applyAlterTable(const DBInfoPtr & db_info, TableID table_id) { auto table_info = getter.getTableInfo(db_info->id, table_id); if (table_info == nullptr) @@ -405,7 +414,7 @@ void SchemaBuilder::applyAlterTable(DBInfoPtr db_info, Table } template -void SchemaBuilder::applyAlterLogicalTable(DBInfoPtr db_info, TableInfoPtr table_info, ManageableStoragePtr storage) +void SchemaBuilder::applyAlterLogicalTable(const DBInfoPtr & db_info, const TableInfoPtr & table_info, const ManageableStoragePtr & storage) { // Alter logical table first. applyAlterPhysicalTable(db_info, table_info, storage); @@ -534,6 +543,11 @@ void SchemaBuilder::applyDiff(const SchemaDiff & diff) applySetTiFlashReplica(db_info, diff.table_id); break; } + case SchemaActionType::SetTiFlashMode: + { + applySetTiFlashMode(db_info, diff.table_id); + break; + } default: { if (diff.type < SchemaActionType::MaxRecognizedType) @@ -561,7 +575,7 @@ void SchemaBuilder::applyDiff(const SchemaDiff & diff) } template -void SchemaBuilder::applyPartitionDiff(TiDB::DBInfoPtr db_info, TableID table_id) +void SchemaBuilder::applyPartitionDiff(const TiDB::DBInfoPtr & db_info, TableID table_id) { auto table_info = getter.getTableInfo(db_info->id, table_id); if (table_info == nullptr) @@ -585,7 +599,7 @@ void SchemaBuilder::applyPartitionDiff(TiDB::DBInfoPtr db_in } template -void SchemaBuilder::applyPartitionDiff(TiDB::DBInfoPtr db_info, TableInfoPtr table_info, ManageableStoragePtr storage) +void SchemaBuilder::applyPartitionDiff(const TiDB::DBInfoPtr & db_info, const TableInfoPtr & table_info, const ManageableStoragePtr & storage) { const auto & orig_table_info = storage->getTableInfo(); if (!orig_table_info.isLogicalPartitionTable()) @@ -651,7 +665,7 @@ void SchemaBuilder::applyPartitionDiff(TiDB::DBInfoPtr db_in } template -void SchemaBuilder::applyRenameTable(DBInfoPtr new_db_info, TableID table_id) +void SchemaBuilder::applyRenameTable(const DBInfoPtr & new_db_info, TableID table_id) { auto new_table_info = getter.getTableInfo(new_db_info->id, table_id); if (new_table_info == nullptr) @@ -671,9 +685,9 @@ void SchemaBuilder::applyRenameTable(DBInfoPtr new_db_info, template void SchemaBuilder::applyRenameLogicalTable( - DBInfoPtr new_db_info, - TableInfoPtr new_table_info, - ManageableStoragePtr storage) + const DBInfoPtr & new_db_info, + const TableInfoPtr & new_table_info, + const ManageableStoragePtr & storage) { applyRenamePhysicalTable(new_db_info, *new_table_info, storage); @@ -695,9 +709,9 @@ void SchemaBuilder::applyRenameLogicalTable( template void SchemaBuilder::applyRenamePhysicalTable( - DBInfoPtr new_db_info, - TableInfo & new_table_info, - ManageableStoragePtr storage) + const DBInfoPtr & new_db_info, + const TableInfo & new_table_info, + const ManageableStoragePtr & storage) { const auto old_mapped_db_name = storage->getDatabaseName(); const auto new_mapped_db_name = name_mapper.mapDatabaseName(*new_db_info); @@ -900,7 +914,7 @@ String createDatabaseStmt(Context & context, const DBInfo & db_info, const Schem } template -void SchemaBuilder::applyCreateSchema(TiDB::DBInfoPtr db_info) +void SchemaBuilder::applyCreateSchema(const TiDB::DBInfoPtr & db_info) { GET_METRIC(tiflash_schema_internal_ddl_count, 
type_create_db).Increment(); LOG_FMT_INFO(log, "Creating database {}", name_mapper.debugDatabaseName(*db_info)); @@ -963,13 +977,12 @@ void SchemaBuilder::applyDropSchema(const String & db_name) } std::tuple -parseColumnsFromTableInfo(const TiDB::TableInfo & table_info, Poco::Logger * log) +parseColumnsFromTableInfo(const TiDB::TableInfo & table_info) { NamesAndTypes columns; std::vector primary_keys; for (const auto & column : table_info.columns) { - LOG_FMT_DEBUG(log, "Analyzing column: {}, type: {}", column.name, static_cast(column.tp)); DataTypePtr type = getDataTypeByColumnInfo(column); columns.emplace_back(column.name, type); if (column.hasPriKeyFlag()) @@ -999,7 +1012,7 @@ String createTableStmt( Poco::Logger * log) { LOG_FMT_DEBUG(log, "Analyzing table info : {}", table_info.serialize()); - auto [columns, pks] = parseColumnsFromTableInfo(table_info, log); + auto [columns, pks] = parseColumnsFromTableInfo(table_info); String stmt; WriteBufferFromString stmt_buf(stmt); @@ -1040,7 +1053,7 @@ String createTableStmt( } template -void SchemaBuilder::applyCreatePhysicalTable(DBInfoPtr db_info, TableInfoPtr table_info) +void SchemaBuilder::applyCreatePhysicalTable(const DBInfoPtr & db_info, const TableInfoPtr & table_info) { GET_METRIC(tiflash_schema_internal_ddl_count, type_create_table).Increment(); LOG_FMT_INFO(log, "Creating table {}", name_mapper.debugCanonicalName(*db_info, *table_info)); @@ -1102,7 +1115,7 @@ void SchemaBuilder::applyCreatePhysicalTable(DBInfoPtr db_in } template -void SchemaBuilder::applyCreateTable(TiDB::DBInfoPtr db_info, TableID table_id) +void SchemaBuilder::applyCreateTable(const TiDB::DBInfoPtr & db_info, TableID table_id) { auto table_info = getter.getTableInfo(db_info->id, table_id); if (table_info == nullptr) @@ -1116,7 +1129,7 @@ void SchemaBuilder::applyCreateTable(TiDB::DBInfoPtr db_info } template -void SchemaBuilder::applyCreateLogicalTable(TiDB::DBInfoPtr db_info, TableInfoPtr table_info) +void SchemaBuilder::applyCreateLogicalTable(const TiDB::DBInfoPtr & db_info, const TableInfoPtr & table_info) { if (table_info->isLogicalPartitionTable()) { @@ -1162,7 +1175,7 @@ void SchemaBuilder::applyDropPhysicalTable(const String & db } template -void SchemaBuilder::applyDropTable(DBInfoPtr db_info, TableID table_id) +void SchemaBuilder::applyDropTable(const DBInfoPtr & db_info, TableID table_id) { auto & tmt_context = context.getTMTContext(); auto * storage = tmt_context.getStorages().get(table_id).get(); @@ -1186,13 +1199,14 @@ void SchemaBuilder::applyDropTable(DBInfoPtr db_info, TableI } template -void SchemaBuilder::applySetTiFlashReplica(TiDB::DBInfoPtr db_info, TableID table_id) +void SchemaBuilder::applySetTiFlashReplica(const TiDB::DBInfoPtr & db_info, TableID table_id) { auto latest_table_info = getter.getTableInfo(db_info->id, table_id); if (unlikely(latest_table_info == nullptr)) { throw TiFlashException(fmt::format("miss table in TiKV : {}", table_id), Errors::DDL::StaleSchema); } + auto & tmt_context = context.getTMTContext(); auto storage = tmt_context.getStorages().get(latest_table_info->id); if (unlikely(storage == nullptr)) @@ -1201,18 +1215,37 @@ void SchemaBuilder::applySetTiFlashReplica(TiDB::DBInfoPtr d Errors::DDL::MissingTable); } - auto managed_storage = std::dynamic_pointer_cast(storage); - if (unlikely(!managed_storage)) - throw Exception(fmt::format("{} is not a ManageableStorage", name_mapper.debugCanonicalName(*db_info, *latest_table_info))); + applySetTiFlashReplicaOnLogicalTable(db_info, latest_table_info, storage); +} - 
applySetTiFlashReplica(db_info, latest_table_info, managed_storage);
+template <typename Getter, typename NameMapper>
+void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplicaOnLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage)
+{
+    applySetTiFlashReplicaOnPhysicalTable(db_info, table_info, storage);
+
+    if (table_info->isLogicalPartitionTable())
+    {
+        auto & tmt_context = context.getTMTContext();
+
+        for (const auto & part_def : table_info->partition.definitions)
+        {
+            auto new_part_table_info = table_info->producePartitionTableInfo(part_def.id, name_mapper);
+            auto part_storage = tmt_context.getStorages().get(new_part_table_info->id);
+            if (unlikely(part_storage == nullptr))
+            {
+                throw TiFlashException(fmt::format("miss table in TiFlash : {}", name_mapper.debugCanonicalName(*db_info, *new_part_table_info)),
+                                       Errors::DDL::MissingTable);
+            }
+            applySetTiFlashReplicaOnPhysicalTable(db_info, new_part_table_info, part_storage);
+        }
+    }
 }
 template <typename Getter, typename NameMapper>
-void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplica(
-    TiDB::DBInfoPtr db_info,
-    TiDB::TableInfoPtr latest_table_info,
-    ManageableStoragePtr storage)
+void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplicaOnPhysicalTable(
+    const TiDB::DBInfoPtr & db_info,
+    const TiDB::TableInfoPtr & latest_table_info,
+    const ManageableStoragePtr & storage)
 {
     if (storage->getTableInfo().replica_info.count == latest_table_info->replica_info.count)
         return;
@@ -1231,6 +1264,75 @@ void SchemaBuilder::applySetTiFlashReplica(
     LOG_FMT_INFO(log, "Updated replica info for {}", name_mapper.debugCanonicalName(*db_info, table_info));
 }
+
+template <typename Getter, typename NameMapper>
+void SchemaBuilder<Getter, NameMapper>::applySetTiFlashMode(const TiDB::DBInfoPtr & db_info, TableID table_id)
+{
+    auto latest_table_info = getter.getTableInfo(db_info->id, table_id);
+
+    if (unlikely(latest_table_info == nullptr))
+    {
+        throw TiFlashException(fmt::format("miss table in TiKV : {}", table_id), Errors::DDL::StaleSchema);
+    }
+
+    auto & tmt_context = context.getTMTContext();
+    auto storage = tmt_context.getStorages().get(latest_table_info->id);
+    if (unlikely(storage == nullptr))
+    {
+        throw TiFlashException(fmt::format("miss table in TiFlash : {}", name_mapper.debugCanonicalName(*db_info, *latest_table_info)),
+                               Errors::DDL::MissingTable);
+    }
+
+    applySetTiFlashModeOnLogicalTable(db_info, latest_table_info, storage);
+}
+
+template <typename Getter, typename NameMapper>
+void SchemaBuilder<Getter, NameMapper>::applySetTiFlashModeOnLogicalTable(
+    const TiDB::DBInfoPtr & db_info,
+    const TiDB::TableInfoPtr & table_info,
+    const ManageableStoragePtr & storage)
+{
+    applySetTiFlashModeOnPhysicalTable(db_info, table_info, storage);
+
+    if (table_info->isLogicalPartitionTable())
+    {
+        auto & tmt_context = context.getTMTContext();
+        for (const auto & part_def : table_info->partition.definitions)
+        {
+            auto new_part_table_info = table_info->producePartitionTableInfo(part_def.id, name_mapper);
+            auto part_storage = tmt_context.getStorages().get(new_part_table_info->id);
+            if (unlikely(part_storage == nullptr))
+            {
+                throw TiFlashException(fmt::format("miss table in TiFlash : {}", name_mapper.debugCanonicalName(*db_info, *new_part_table_info)),
+                                       Errors::DDL::MissingTable);
+            }
+            applySetTiFlashModeOnPhysicalTable(db_info, new_part_table_info, part_storage);
+        }
+    }
+}
+
+
+template <typename Getter, typename NameMapper>
+void SchemaBuilder<Getter, NameMapper>::applySetTiFlashModeOnPhysicalTable(
+    const TiDB::DBInfoPtr & db_info,
+    const TiDB::TableInfoPtr & latest_table_info,
+    const ManageableStoragePtr & storage)
+{
+    if (storage->getTableInfo().tiflash_mode == latest_table_info->tiflash_mode)
+        return;
+
+    TiDB::TableInfo table_info = storage->getTableInfo();
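+    // Take a copy of the storage's current table info and overwrite only tiflash_mode;
+    // the updated table info is persisted by alterFromTiDB below (the AlterCommands list stays empty).
+    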
table_info.tiflash_mode = latest_table_info->tiflash_mode; + AlterCommands commands; + + LOG_FMT_INFO(log, "Updating tiflash mode for {} to {}", name_mapper.debugCanonicalName(*db_info, table_info), TiFlashModeToString(table_info.tiflash_mode)); + + auto alter_lock = storage->lockForAlter(getThreadName()); + storage->alterFromTiDB(alter_lock, commands, name_mapper.mapDatabaseName(*db_info), table_info, name_mapper, context); + LOG_FMT_INFO(log, "Updated tiflash mode for {} to {}", name_mapper.debugCanonicalName(*db_info, table_info), TiFlashModeToString(table_info.tiflash_mode)); +} + + template void SchemaBuilder::syncAllSchema() { @@ -1299,7 +1401,9 @@ void SchemaBuilder::syncAllSchema() /// Rename if needed. applyRenameLogicalTable(db, table, storage); /// Update replica info if needed. - applySetTiFlashReplica(db, table, storage); + applySetTiFlashReplicaOnLogicalTable(db, table, storage); + /// Update tiflash mode if needed. + applySetTiFlashModeOnLogicalTable(db, table, storage); /// Alter if needed. applyAlterLogicalTable(db, table, storage); LOG_FMT_DEBUG(log, "Table {} synced during sync all schemas", name_mapper.debugCanonicalName(*db, *table)); diff --git a/dbms/src/TiDB/Schema/SchemaBuilder.h b/dbms/src/TiDB/Schema/SchemaBuilder.h index 8446765f74a..827203a682f 100644 --- a/dbms/src/TiDB/Schema/SchemaBuilder.h +++ b/dbms/src/TiDB/Schema/SchemaBuilder.h @@ -55,39 +55,44 @@ struct SchemaBuilder bool applyCreateSchema(DatabaseID schema_id); - void applyCreateSchema(TiDB::DBInfoPtr db_info); + void applyCreateSchema(const TiDB::DBInfoPtr & db_info); - void applyCreateTable(TiDB::DBInfoPtr db_info, TableID table_id); + void applyCreateTable(const TiDB::DBInfoPtr & db_info, TableID table_id); - void applyCreateLogicalTable(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info); + void applyCreateLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info); - void applyCreatePhysicalTable(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info); + void applyCreatePhysicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info); - void applyDropTable(TiDB::DBInfoPtr db_info, TableID table_id); + void applyDropTable(const TiDB::DBInfoPtr & db_info, TableID table_id); /// Parameter schema_name should be mapped. 
void applyDropPhysicalTable(const String & db_name, TableID table_id); - void applyPartitionDiff(TiDB::DBInfoPtr db_info, TableID table_id); + void applyPartitionDiff(const TiDB::DBInfoPtr & db_info, TableID table_id); - void applyPartitionDiff(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info, ManageableStoragePtr storage); + void applyPartitionDiff(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); - void applyAlterTable(TiDB::DBInfoPtr db_info, TableID table_id); + void applyAlterTable(const TiDB::DBInfoPtr & db_info, TableID table_id); - void applyAlterLogicalTable(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info, ManageableStoragePtr storage); + void applyAlterLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); - void applyAlterPhysicalTable(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info, ManageableStoragePtr storage); + void applyAlterPhysicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); - void applyRenameTable(TiDB::DBInfoPtr new_db_info, TiDB::TableID table_id); + void applyRenameTable(const TiDB::DBInfoPtr & new_db_info, TiDB::TableID table_id); - void applyRenameLogicalTable(TiDB::DBInfoPtr new_db_info, TiDB::TableInfoPtr new_table_info, ManageableStoragePtr storage); + void applyRenameLogicalTable(const TiDB::DBInfoPtr & new_db_info, const TiDB::TableInfoPtr & new_table_info, const ManageableStoragePtr & storage); - void applyRenamePhysicalTable(TiDB::DBInfoPtr new_db_info, TiDB::TableInfo & new_table_info, ManageableStoragePtr storage); + void applyRenamePhysicalTable(const TiDB::DBInfoPtr & new_db_info, const TiDB::TableInfo & new_table_info, const ManageableStoragePtr & storage); void applyExchangeTablePartition(const SchemaDiff & diff); - void applySetTiFlashReplica(TiDB::DBInfoPtr db_info, TableID table_id); - void applySetTiFlashReplica(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info, ManageableStoragePtr storage); + void applySetTiFlashReplica(const TiDB::DBInfoPtr & db_info, TableID table_id); + void applySetTiFlashReplicaOnLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); + void applySetTiFlashReplicaOnPhysicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); + + void applySetTiFlashMode(const TiDB::DBInfoPtr & db_info, TableID table_id); + void applySetTiFlashModeOnLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); + void applySetTiFlashModeOnPhysicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); }; } // namespace DB diff --git a/dbms/src/TiDB/Schema/SchemaGetter.cpp b/dbms/src/TiDB/Schema/SchemaGetter.cpp index 7f52f9301b1..6e333d6ba87 100644 --- a/dbms/src/TiDB/Schema/SchemaGetter.cpp +++ b/dbms/src/TiDB/Schema/SchemaGetter.cpp @@ -19,7 +19,6 @@ namespace DB { - namespace ErrorCodes { extern const int SCHEMA_SYNC_ERROR; @@ -188,18 +187,26 @@ Int64 SchemaGetter::getVersion() return std::stoll(ver); } +bool SchemaGetter::checkSchemaDiffExists(Int64 ver) +{ + String key = getSchemaDiffKey(ver); + String data = TxnStructure::get(snap, key); + return !data.empty(); +} + String SchemaGetter::getSchemaDiffKey(Int64 ver) { return std::string(schemaDiffPrefix) + ":" + 
std::to_string(ver);
 }
-SchemaDiff SchemaGetter::getSchemaDiff(Int64 ver)
+std::optional<SchemaDiff> SchemaGetter::getSchemaDiff(Int64 ver)
 {
     String key = getSchemaDiffKey(ver);
     String data = TxnStructure::get(snap, key);
     if (data.empty())
     {
-        throw TiFlashException("cannot find schema diff for version: " + std::to_string(ver), Errors::Table::SyncError);
+        LOG_FMT_WARNING(log, "The schema diff for version {}, key {} is empty.", ver, key);
+        return std::nullopt;
     }
     SchemaDiff diff;
     diff.deserialize(data);
diff --git a/dbms/src/TiDB/Schema/SchemaGetter.h b/dbms/src/TiDB/Schema/SchemaGetter.h
index cfa5e1c6335..72fd00678f7 100644
--- a/dbms/src/TiDB/Schema/SchemaGetter.h
+++ b/dbms/src/TiDB/Schema/SchemaGetter.h
@@ -26,8 +26,11 @@
 #include
+#include <optional>
+
 namespace DB
 {
+// The enum values are exactly the same as the DDL Actions listed in "parser/model/ddl.go" of the TiDB codebase, and must be kept in sync with it.
 enum class SchemaActionType : Int8
 {
     None = 0,
@@ -91,11 +94,14 @@ enum class SchemaActionType : Int8
     AlterTableStatsOptions = 58,
     AlterNoCacheTable = 59,
     CreateTables = 60,
+    ActionMultiSchemaChange = 61,
+    SetTiFlashMode = 62,
+
     // If we support a new action type from TiDB,
     // MaxRecognizedType also needs to be changed.
     // It should always be equal to the maximum supported type + 1
-    MaxRecognizedType = 61,
+    MaxRecognizedType = 63,
 };
 struct AffectedOption
@@ -137,7 +143,9 @@ struct SchemaGetter
     Int64 getVersion();
-    SchemaDiff getSchemaDiff(Int64 ver);
+    bool checkSchemaDiffExists(Int64 ver);
+
+    std::optional<SchemaDiff> getSchemaDiff(Int64 ver);
     static String getSchemaDiffKey(Int64 ver);
diff --git a/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h b/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h
index 4fdba195acb..a23aeab139f 100644
--- a/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h
+++ b/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h
@@ -106,21 +106,31 @@ struct TiDBSchemaSyncer : public SchemaSyncer
         Stopwatch watch;
         SCOPE_EXIT({ GET_METRIC(tiflash_schema_apply_duration_seconds).Observe(watch.elapsedSeconds()); });
-        LOG_FMT_INFO(log, "start to sync schemas. current version is: {} and try to sync schema version to: {}", cur_version, version);
+        LOG_FMT_INFO(log, "Start to sync schemas. Current version is {}, try to sync schema version to {}", cur_version, version);
         // Show whether the schema mutex is held for a long time or not.
         GET_METRIC(tiflash_schema_applying).Set(1.0);
         SCOPE_EXIT({ GET_METRIC(tiflash_schema_applying).Set(0.0); });
         GET_METRIC(tiflash_schema_apply_count, type_diff).Increment();
-        if (!tryLoadSchemaDiffs(getter, version, context))
+        // Since the concurrent DDL feature, TiDB does `update schema version` before `set schema diff`, and they are done in separate transactions.
+        // So TiFlash may see a schema version X but no schema diff X, meaning that the transaction writing schema diff X has not been committed or has
+        // been aborted.
+        // However, TiDB guarantees that if we see schema version X, then schema diff X-1 must exist. Otherwise the transaction of schema diff
+        // X-1 was aborted and we can safely ignore it.
+        // Since TiDB cannot guarantee that the schema diff of the latest schema version X is non-empty, in this situation we should set `cur_version`
+        // to X-1 and try to fetch schema diff X again next time.
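+        // For example (illustrative numbers, not from the source): TiDB has bumped the schema version to 100, but the
+        // transaction writing schema diff 100 is still in flight; this round then syncs up to version 99, and the next
+        // round retries schema diff 100.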
+        Int64 version_after_load_diff = 0;
+        if (version_after_load_diff = tryLoadSchemaDiffs(getter, version, context); version_after_load_diff == -1)
        {
            GET_METRIC(tiflash_schema_apply_count, type_full).Increment();
            loadAllSchema(getter, version, context);
+            // After loadAllSchema, update `version_after_load_diff` according to whether the schema diff of the last version exists.
+            version_after_load_diff = getter.checkSchemaDiffExists(version) ? version : version - 1;
        }
-        cur_version = version;
+        cur_version = version_after_load_diff;
        GET_METRIC(tiflash_schema_version).Set(cur_version);
-        LOG_FMT_INFO(log, "end sync schema, version has been updated to {}", cur_version);
+        LOG_FMT_INFO(log, "End sync schema, version has been updated to {}{}", cur_version, cur_version == version ? "" : " (latest diff is empty)");
        return true;
    }
@@ -144,30 +154,60 @@ struct TiDBSchemaSyncer : public SchemaSyncer
        return it->second;
    }
-    bool tryLoadSchemaDiffs(Getter & getter, Int64 version, Context & context)
+    // Return values:
+    // - if the latest schema diff is not empty, return latest_version
+    // - if the latest schema diff is empty, return (latest_version - 1)
+    // - if an error happened, return -1
+    Int64 tryLoadSchemaDiffs(Getter & getter, Int64 latest_version, Context & context)
    {
-        if (isTooOldSchema(cur_version, version))
+        if (isTooOldSchema(cur_version, latest_version))
        {
-            return false;
+            return -1;
        }
-        LOG_FMT_DEBUG(log, "try load schema diffs.");
+        LOG_FMT_DEBUG(log, "Try to load schema diffs.");
-        SchemaBuilder<Getter, NameMapper> builder(getter, context, databases, version);
+        SchemaBuilder<Getter, NameMapper> builder(getter, context, databases, latest_version);
        Int64 used_version = cur_version;
-        std::vector<SchemaDiff> diffs;
-        while (used_version < version)
+        // First fetch all schema diffs from `cur_version` to `latest_version`. Only apply the diffs if all of
+        // them have been fetched without any exception.
+        std::vector<std::optional<SchemaDiff>> diffs;
+        while (used_version < latest_version)
        {
            used_version++;
            diffs.push_back(getter.getSchemaDiff(used_version));
        }
-        LOG_FMT_DEBUG(log, "end load schema diffs with total {} entries.", diffs.size());
+        LOG_FMT_DEBUG(log, "End loading schema diffs with total {} entries.", diffs.size());
+
        try
        {
-            for (const auto & diff : diffs)
            {
-                builder.applyDiff(diff);
+            for (size_t diff_index = 0; diff_index < diffs.size(); ++diff_index)
+            {
+                const auto & schema_diff = diffs[diff_index];
+
+                if (!schema_diff)
+                {
+                    // If the schema diff of `latest_version` is empty,
+                    // we do not apply `latest_version` but only apply up to `latest_version - 1`.
+                    // If a schema diff within [`cur_version`, `latest_version - 1`] is empty,
+                    // we just skip it.
+                    //
+                    // Example:
+                    // - `cur_version` is 1, `latest_version` is 10
+                    // - if the schema diffs of versions [2,4,6] are empty, we just skip them
+                    // - if the schema diff of version 10 is empty, we only apply up to version 9
+                    if (diff_index != diffs.size() - 1)
+                    {
+                        LOG_FMT_WARNING(log, "Skip the empty schema diff of version {}. 
", cur_version + diff_index + 1); + continue; + } + + // if diff_index == diffs.size() - 1, return used_version - 1; + return used_version - 1; + } + + builder.applyDiff(*schema_diff); } } catch (TiFlashException & e) @@ -177,7 +217,7 @@ struct TiDBSchemaSyncer : public SchemaSyncer GET_METRIC(tiflash_schema_apply_count, type_failed).Increment(); } LOG_FMT_WARNING(log, "apply diff meets exception : {} \n stack is {}", e.displayText(), e.getStackTrace().toString()); - return false; + return -1; } catch (Exception & e) { @@ -187,21 +227,22 @@ struct TiDBSchemaSyncer : public SchemaSyncer } GET_METRIC(tiflash_schema_apply_count, type_failed).Increment(); LOG_FMT_WARNING(log, "apply diff meets exception : {} \n stack is {}", e.displayText(), e.getStackTrace().toString()); - return false; + return -1; } catch (Poco::Exception & e) { GET_METRIC(tiflash_schema_apply_count, type_failed).Increment(); LOG_FMT_WARNING(log, "apply diff meets exception : {}", e.displayText()); - return false; + return -1; } catch (std::exception & e) { GET_METRIC(tiflash_schema_apply_count, type_failed).Increment(); LOG_FMT_WARNING(log, "apply diff meets exception : {}", e.what()); - return false; + return -1; } - return true; + + return used_version; } void loadAllSchema(Getter & getter, Int64 version, Context & context) diff --git a/dbms/src/WindowFunctions/tests/gtest_window_functions.cpp b/dbms/src/WindowFunctions/tests/gtest_window_functions.cpp index e4205f6f938..06253cac66e 100644 --- a/dbms/src/WindowFunctions/tests/gtest_window_functions.cpp +++ b/dbms/src/WindowFunctions/tests/gtest_window_functions.cpp @@ -12,334 +12,180 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include -#include -#include -#include -#include -#include +#include namespace DB::tests { -class WindowFunction : public DB::tests::FunctionTest +class WindowExecutorTestRunner : public DB::tests::ExecutorTest { -protected: - std::shared_ptr mock_interpreter; - - void SetUp() override - { - DB::tests::FunctionTest::SetUp(); - DB::registerWindowFunctions(); - } - - template - ColumnWithTypeAndName toNullableVec(String name, const std::vector::FieldType>> & v) - { - return createColumn>(v, name); - } - - template - ColumnWithTypeAndName toVec(String name, const std::vector::FieldType> & v) - { - return createColumn(v, name); - } - - template - static ColumnWithTypeAndName toConst(const T s) - { - return createConstColumn(1, s); - } - - static ColumnWithTypeAndName toDatetimeVec(String name, const std::vector & v, int fsp) - { - std::vector::FieldType> vec; - for (const auto & value_str : v) - { - Field value = parseMyDateTime(value_str, fsp); - vec.push_back(value.template safeGet()); - } - DataTypePtr data_type = std::make_shared(fsp); - return {makeColumn(data_type, vec), data_type, name, 0}; - } - - static ColumnWithTypeAndName toNullableDatetimeVec(String name, const std::vector & v, int fsp) - { - std::vector::FieldType>> vec; - for (const auto & value_str : v) - { - if (!value_str.empty()) - { - Field value = parseMyDateTime(value_str, fsp); - vec.push_back(value.template safeGet()); - } - else - { - vec.push_back({}); - } - } - DataTypePtr data_type = makeNullable(std::make_shared(fsp)); - return {makeColumn>(data_type, vec), data_type, name, 0}; - } - - void setMaxBlockSize(int size) - { - context.getSettingsRef().max_block_size.set(size); - } - - void mockInterpreter(std::vector source_columns, Context context) - { - std::vector mock_input_streams_vec = {}; 
- DAGQueryBlock mock_query_block(0, static_cast>(nullptr)); - std::vector mock_subqueries_for_sets = {}; - mock_interpreter = std::make_shared(context, - mock_input_streams_vec, - mock_query_block, - 1); - - mock_interpreter->analyzer = std::make_unique(std::move(source_columns), context); - } - - void mockExecuteTableScan(DAGPipeline & pipeline, ColumnsWithTypeAndName columns) - { - pipeline.streams.push_back(std::make_shared(columns, context.getSettingsRef().max_block_size)); - mock_interpreter->input_streams_vec.push_back(pipeline.streams); - } - - void mockExecuteWindowOrder(DAGPipeline & pipeline, std::string sort_json_str) +public: + void initializeContext() override { - tipb::Sort sort; - google::protobuf::util::JsonStringToMessage(sort_json_str, &sort); - mock_interpreter->handleWindowOrder(pipeline, sort); - mock_interpreter->input_streams_vec[0] = pipeline.streams; - NamesWithAliases final_project; - for (const auto & column : (*mock_interpreter->analyzer).source_columns) - { - final_project.push_back({column.name, ""}); - } - mockExecuteProject(pipeline, final_project); - } - - void mockExecuteWindow(DAGPipeline & pipeline, std::string window_json_str) - { - tipb::Window window; - google::protobuf::util::JsonStringToMessage(window_json_str, &window); - mock_interpreter->handleWindow(pipeline, window); - mock_interpreter->input_streams_vec[0] = pipeline.streams; - NamesWithAliases final_project; - for (const auto & column : (*mock_interpreter->analyzer).source_columns) - { - final_project.push_back({column.name, ""}); - } - mockExecuteProject(pipeline, final_project); - } - - void mockExecuteProject(DAGPipeline & pipeline, NamesWithAliases & final_project) - { - mock_interpreter->executeProject(pipeline, final_project); - } - - static Block mergeBlocks(Blocks blocks) - { - if (blocks.empty()) - { - return {}; - } - Block sample_block; - std::vector actual_cols; - - for (const auto & block : blocks) - { - if (!sample_block) - { - sample_block = block; - for (const auto & column : block.getColumnsWithTypeAndName()) - { - actual_cols.push_back(column.type->createColumn()); - } - } - - for (size_t i = 0; i < block.columns(); ++i) - { - for (size_t j = 0; j < block.rows(); ++j) - { - actual_cols[i]->insert((*(block.getColumnsWithTypeAndName())[i].column)[j]); - } - } - } - - ColumnsWithTypeAndName actual_columns; - - for (size_t i = 0; i < actual_cols.size(); ++i) - { - actual_columns.push_back({std::move(actual_cols[i]), sample_block.getColumnsWithTypeAndName()[i].type, sample_block.getColumnsWithTypeAndName()[i].name, sample_block.getColumnsWithTypeAndName()[i].column_id}); - } - return Block(actual_columns); - } - - void testOneWindowFunction(const std::vector & source_column_types, const ColumnsWithTypeAndName & source_columns, const ColumnsWithTypeAndName & expect_columns, const std::string window_json_str, const std::string sort_json_str) - { - mockInterpreter(source_column_types, context); - DAGPipeline pipeline; - ExpressionActionsChain chain; - Block except_block(expect_columns); - - mockExecuteTableScan(pipeline, source_columns); - - mockExecuteWindowOrder(pipeline, sort_json_str); - - mockExecuteWindow(pipeline, window_json_str); - - auto stream = pipeline.firstStream(); - - Blocks actual_blocks; - while (Block block = stream->read()) - { - actual_blocks.push_back(block); - } - - Block actual_block = mergeBlocks(actual_blocks); - - if (actual_block) - { - // Check that input columns is properly split to many blocks - ASSERT_EQ(actual_blocks.size(), (actual_block.rows() - 1) 
/ context.getSettingsRef().max_block_size + 1); - } - ASSERT_BLOCK_EQ(except_block, actual_block); + ExecutorTest::initializeContext(); + context.addMockTable( + {"test_db", "test_table"}, + {{"partition", TiDB::TP::TypeLongLong}, {"order", TiDB::TP::TypeLongLong}}, + {toVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), + toVec("order", {1, 1, 2, 2, 1, 1, 2, 2})}); + context.addMockTable( + {"test_db", "test_table_string"}, + {{"partition", TiDB::TP::TypeString}, {"order", TiDB::TP::TypeString}}, + {toVec("partition", {"banana", "banana", "banana", "banana", "apple", "apple", "apple", "apple"}), + toVec("order", {"apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"})}); + + context.addMockTable( + {"test_db", "test_table_more_cols"}, + {{"partition1", TiDB::TP::TypeLongLong}, {"partition2", TiDB::TP::TypeLongLong}, {"order1", TiDB::TP::TypeLongLong}, {"order2", TiDB::TP::TypeLongLong}}, + {toVec("partition1", {1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2}), + toVec("partition2", {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2}), + toVec("order1", {2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1}), + toVec("order2", {2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1})}); + + context.addMockTable( + {"test_db", "test_table_float64"}, + {{"partition", TiDB::TP::TypeDouble}, {"order", TiDB::TP::TypeDouble}}, + {toVec("partition", {1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), + toVec("order", {1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00})}); + + context.addMockTable( + {"test_db", "test_table_datetime"}, + {{"partition", TiDB::TP::TypeDatetime}, {"order", TiDB::TP::TypeDatetime}}); + + context.addMockTable( + {"test_db", "test_table_for_rank"}, + {{"partition", TiDB::TP::TypeLongLong}, {"order", TiDB::TP::TypeLongLong}}, + {toVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), + toVec("order", {1, 1, 2, 2, 1, 1, 2, 2})}); } }; -TEST_F(WindowFunction, testWindowFunctionByPartitionAndOrder) +TEST_F(WindowExecutorTestRunner, testWindowFunctionByPartitionAndOrder) try { - setMaxBlockSize(3); - - std::string window_json; - std::string sort_json; - /***** row_number with different types of input *****/ // int - sql : select *, row_number() over w1 from test1 window w1 as (partition by partition_int order by order_int) - window_json = 
R"({"funcDesc":[{"tp":"RowNumber","sig":"Unspecified","fieldType":{"tp":8,"flag":128,"flen":21,"decimal":-1,"collate":63,"charset":"binary"},"hasDistinct":false}],"partitionBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"orderBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"frame":{"type":"Rows","start":{"type":"CurrentRow","unbounded":false,"offset":"0"},"end":{"type":"CurrentRow","unbounded":false,"offset":"0"}},"child":{"tp":"TypeSort","executorId":"Sort_12","sort":{"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGAkMCV6NP+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}}}})"; - sort_json = R"({"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGAkMCV6NP+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}})"; - testOneWindowFunction( - {NameAndTypePair("partition", std::make_shared()), NameAndTypePair("order", std::make_shared())}, - {toVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), toVec("order", {1, 1, 2, 2, 1, 1, 2, 2})}, - {toVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), toVec("order", {1, 1, 2, 2, 1, 1, 2, 2}), toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})}, - window_json, - sort_json); + auto request = context + .scan("test_db", "test_table") + .sort({{"partition", false}, {"order", false}, {"partition", false}, {"order", false}}, true) + .window(RowNumber(), 
{"order", false}, {"partition", false}, buildDefaultRowsFrame()) + .build(context); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {1, 1, 2, 2, 1, 1, 2, 2}), + toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})})); // null input - testOneWindowFunction( - {NameAndTypePair("partition", makeNullable(std::make_shared())), NameAndTypePair("order", makeNullable(std::make_shared()))}, + executeStreamsWithSingleSource( + request, {toNullableVec("partition", {}), toNullableVec("order", {})}, - {}, - window_json, - sort_json); + {}); // nullable - testOneWindowFunction( - {NameAndTypePair("partition", makeNullable(std::make_shared())), NameAndTypePair("order", makeNullable(std::make_shared()))}, - {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2})}, - {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2}), toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})}, - window_json, - sort_json); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), {toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2})}}), + createColumns({toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2}), toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})})); // string - sql : select *, row_number() over w1 from test2 window w1 as (partition by partition_string order by order_string) - window_json = R"({"funcDesc":[{"tp":"RowNumber","sig":"Unspecified","fieldType":{"tp":8,"flag":128,"flen":21,"decimal":-1,"collate":63,"charset":"binary"},"hasDistinct":false}],"partitionBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false}],"orderBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false}],"frame":{"type":"Rows","start":{"type":"CurrentRow","unbounded":false,"offset":"0"},"end":{"type":"CurrentRow","unbounded":false,"offset":"0"}},"child":{"tp":"TypeSort","executorId":"Sort_12","sort":{"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGA8Nz57tP+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"}]},"executorId":"ExchangeReceiver_11"}}}})"; - sort_json = 
R"({"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGA8Nz57tP+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"},{"tp":254,"flag":0,"flen":32,"decimal":0,"collate":46,"charset":"utf8mb4"}]},"executorId":"ExchangeReceiver_11"}})"; - testOneWindowFunction( - {NameAndTypePair("partition", std::make_shared()), NameAndTypePair("order", std::make_shared())}, - {toVec("partition", {"banana", "banana", "banana", "banana", "apple", "apple", "apple", "apple"}), toVec("order", {"apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"})}, - {toVec("partition", {"apple", "apple", "apple", "apple", "banana", "banana", "banana", "banana"}), toVec("order", {"apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"}), toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})}, - window_json, - sort_json); + request = context + .scan("test_db", "test_table_string") + .sort({{"partition", false}, {"order", false}, {"partition", false}, {"order", false}}, true) + .window(RowNumber(), {"order", false}, {"partition", false}, buildDefaultRowsFrame()) + .build(context); - // nullable - testOneWindowFunction( - {NameAndTypePair("partition", makeNullable(std::make_shared())), NameAndTypePair("order", makeNullable(std::make_shared()))}, - {toNullableVec("partition", {"banana", "banana", "banana", "banana", {}, "apple", "apple", "apple", "apple"}), toNullableVec("order", {"apple", "apple", "banana", "banana", {}, "apple", "apple", "banana", "banana"})}, - {toNullableVec("partition", {{}, "apple", "apple", "apple", "apple", "banana", "banana", "banana", "banana"}), toNullableVec("order", {{}, "apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"}), toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})}, - window_json, - sort_json); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition", {"apple", "apple", "apple", "apple", "banana", "banana", "banana", "banana"}), + toNullableVec("order", {"apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"}), + toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})})); - // decimal - sql : select *, row_number() over w1 from test3 window w1 as (partition by partition_float order by order_decimal) - window_json = 
R"({"funcDesc":[{"tp":"RowNumber","sig":"Unspecified","fieldType":{"tp":8,"flag":128,"flen":21,"decimal":-1,"collate":63,"charset":"binary"},"hasDistinct":false}],"partitionBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"orderBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"frame":{"type":"Rows","start":{"type":"CurrentRow","unbounded":false,"offset":"0"},"end":{"type":"CurrentRow","unbounded":false,"offset":"0"}},"child":{"tp":"TypeSort","executorId":"Sort_12","sort":{"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGAoN3M99P+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}}}})"; - sort_json = R"({"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGAoN3M99P+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"},{"tp":246,"flag":0,"flen":6,"decimal":2,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}})"; - testOneWindowFunction( - {NameAndTypePair("partition", std::make_shared()), NameAndTypePair("order", std::make_shared())}, - {toVec("partition", {1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), toVec("order", {1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00})}, - {toVec("partition", {1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), toVec("order", {1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00}), toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})}, - window_json, - sort_json); + // nullable + 
ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableVec("partition", {"banana", "banana", "banana", "banana", {}, "apple", "apple", "apple", "apple"}), + toNullableVec("order", {"apple", "apple", "banana", "banana", {}, "apple", "apple", "banana", "banana"})}), + createColumns({toNullableVec("partition", {{}, "apple", "apple", "apple", "apple", "banana", "banana", "banana", "banana"}), + toNullableVec("order", {{}, "apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"}), + toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})})); + + // float64 - sql : select *, row_number() over w1 from test3 window w1 as (partition by partition_float order by order_float64) + request = context + .scan("test_db", "test_table_float64") + .sort({{"partition", false}, {"order", false}, {"partition", false}, {"order", false}}, true) + .window(RowNumber(), {"order", false}, {"partition", false}, buildDefaultRowsFrame()) + .build(context); + + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition", {1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), + toNullableVec("order", {1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00}), + toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})})); // nullable - testOneWindowFunction( - {NameAndTypePair("partition", makeNullable(std::make_shared())), NameAndTypePair("order", makeNullable(std::make_shared()))}, - {toNullableVec("partition", {{}, 1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), toNullableVec("order", {{}, 1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00})}, - {toNullableVec("partition", {{}, 1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), toNullableVec("order", {{}, 1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00}), toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})}, - window_json, - sort_json); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableVec("partition", {{}, 1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), + toNullableVec("order", {{}, 1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00})}), + createColumns({toNullableVec("partition", {{}, 1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), + toNullableVec("order", {{}, 1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00}), + toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})})); // datetime - select *, row_number() over w1 from test4 window w1 as (partition by partition_datetime order by order_datetime); - window_json = 
R"({"funcDesc":[{"tp":"RowNumber","sig":"Unspecified","fieldType":{"tp":8,"flag":128,"flen":21,"decimal":-1,"collate":63,"charset":"binary"},"hasDistinct":false}],"partitionBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"orderBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"frame":{"type":"Rows","start":{"type":"CurrentRow","unbounded":false,"offset":"0"},"end":{"type":"CurrentRow","unbounded":false,"offset":"0"}},"child":{"tp":"TypeSort","executorId":"Sort_12","sort":{"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGAsNmBhdT+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}}}})"; - sort_json = R"({"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGAsNmBhdT+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"},{"tp":12,"flag":128,"flen":26,"decimal":6,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}})"; - testOneWindowFunction( - {NameAndTypePair("partition", std::make_shared()), NameAndTypePair("order", std::make_shared())}, - {toDatetimeVec("partition", {"20220101010102", "20220101010102", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010101", "20220101010101"}, 0), - toDatetimeVec("order", {"20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0)}, - {toDatetimeVec("partition", {"20220101010101", 
"20220101010101", "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010102", "20220101010102"}, 0), - toDatetimeVec("order", {"20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0), - toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})}, - window_json, - sort_json); + request = context + .scan("test_db", "test_table_datetime") + .sort({{"partition", false}, {"order", false}, {"partition", false}, {"order", false}}, true) + .window(RowNumber(), {"order", false}, {"partition", false}, buildDefaultRowsFrame()) + .build(context); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableDatetimeVec("partition", {"20220101010102", "20220101010102", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010101", "20220101010101"}, 0), + toDatetimeVec("order", {"20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0)}), + createColumns({toNullableDatetimeVec("partition", {"20220101010101", "20220101010101", "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010102", "20220101010102"}, 0), + toNullableDatetimeVec("order", {"20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0), + toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})})); // nullable - testOneWindowFunction( - {NameAndTypePair("partition", makeNullable(std::make_shared())), NameAndTypePair("order", makeNullable(std::make_shared()))}, - {toNullableDatetimeVec("partition", {"20220101010102", {}, "20220101010102", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010101", "20220101010101"}, 0), - toNullableDatetimeVec("order", {"20220101010101", {}, "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0)}, - {toNullableDatetimeVec("partition", {{}, "20220101010101", "20220101010101", "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010102", "20220101010102"}, 0), - toNullableDatetimeVec("order", {{}, "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0), - toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})}, - window_json, - sort_json); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableDatetimeVec("partition", {"20220101010102", {}, "20220101010102", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010101", "20220101010101"}, 0), + toNullableDatetimeVec("order", {"20220101010101", {}, "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0)}), + createColumns({toNullableDatetimeVec("partition", {{}, "20220101010101", "20220101010101", "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010102", "20220101010102"}, 0), + toNullableDatetimeVec("order", {{}, "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0), + toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})})); // 2 partiton key and 2 order key // sql : select *, row_number() over w1 from test6 window w1 as (partition by 
partition_int1, partition_int2 order by order_int1,order_int2) - window_json = R"({"funcDesc":[{"tp":"RowNumber","sig":"Unspecified","fieldType":{"tp":8,"flag":128,"flen":21,"decimal":-1,"collate":63,"charset":"binary"},"hasDistinct":false}],"partitionBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"orderBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAI=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAM=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"frame":{"type":"Rows","start":{"type":"CurrentRow","unbounded":false,"offset":"0"},"end":{"type":"CurrentRow","unbounded":false,"offset":"0"}},"child":{"tp":"TypeSort","executorId":"Sort_12","sort":{"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAI=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAM=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAI=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAM=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIKA0Img1If/BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}}}})"; - sort_json = 
R"({"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAI=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAM=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAI=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAM=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIKA0Img1If/BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}})"; - testOneWindowFunction( - {NameAndTypePair("partition1", std::make_shared()), NameAndTypePair("partition2", std::make_shared()), NameAndTypePair("order1", std::make_shared()), NameAndTypePair("order2", std::make_shared())}, - {toVec("partition1", {1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2}), toVec("partition2", {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2}), toVec("order1", {2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1}), toVec("order2", {2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1})}, - {toVec("partition1", {1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2}), toVec("partition2", {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2}), toVec("order1", {1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2}), toVec("order2", {1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2}), toNullableVec("row_number", {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3})}, - window_json, - sort_json); + request = context + .scan("test_db", "test_table_more_cols") + .sort({{"partition1", false}, {"partition2", false}, {"order1", false}, {"order2", false}}, true) + .window(RowNumber(), {{"order1", false}, {"order2", false}}, {{"partition1", false}, {"partition2", false}}, buildDefaultRowsFrame()) + .build(context); + + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition1", {1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2}), + toNullableVec("partition2", {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2}), + toNullableVec("order1", {1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2}), + toNullableVec("order2", {1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2}), + toNullableVec("row_number", {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3})})); /***** rank, dense_rank *****/ - window_json = 
R"({"funcDesc":[{"tp":"Rank","sig":"Unspecified","fieldType":{"tp":8,"flag":128,"flen":21,"decimal":-1,"collate":63,"charset":"binary"},"hasDistinct":false},{"tp":"DenseRank","sig":"Unspecified","fieldType":{"tp":8,"flag":128,"flen":21,"decimal":-1,"collate":63,"charset":"binary"},"hasDistinct":false}],"partitionBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"orderBy":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"child":{"tp":"TypeSort","executorId":"Sort_12","sort":{"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGAsOnl3NP+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}}}})"; - sort_json = R"({"byItems":[{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAA=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false},{"expr":{"tp":"ColumnRef","val":"gAAAAAAAAAE=","sig":"Unspecified","fieldType":{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},"hasDistinct":false},"desc":false}],"isPartialSort":true,"child":{"tp":"TypeExchangeReceiver","exchangeReceiver":{"encodedTaskMeta":["CIGAsOnl3NP+BRABIg4xMjcuMC4wLjE6MzkzMA=="],"fieldTypes":[{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"},{"tp":3,"flag":0,"flen":11,"decimal":0,"collate":63,"charset":"binary"}]},"executorId":"ExchangeReceiver_11"}})"; - testOneWindowFunction( - {NameAndTypePair("partition", std::make_shared()), NameAndTypePair("order", std::make_shared())}, - {toVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), toVec("order", {1, 1, 2, 2, 1, 1, 2, 2})}, - {toVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), toVec("order", {1, 1, 2, 2, 1, 1, 2, 2}), toNullableVec("rank", {1, 1, 3, 3, 1, 1, 3, 3}), toNullableVec("dense_rank", {1, 1, 2, 2, 1, 1, 2, 2})}, - window_json, - sort_json); + request = context.scan("test_db", "test_table_for_rank").sort({{"partition", false}, {"order", false}}, true).window({Rank(), 
DenseRank()}, {{"order", false}}, {{"partition", false}}, MockWindowFrame{}).build(context); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {1, 1, 2, 2, 1, 1, 2, 2}), + toNullableVec("rank", {1, 1, 3, 3, 1, 1, 3, 3}), + toNullableVec("dense_rank", {1, 1, 2, 2, 1, 1, 2, 2})})); // nullable - testOneWindowFunction( - {NameAndTypePair("partition", makeNullable(std::make_shared())), NameAndTypePair("order", makeNullable(std::make_shared()))}, - {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2})}, - {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2}), toNullableVec("rank", {1, 1, 1, 3, 3, 1, 1, 3, 3}), toNullableVec("dense_rank", {1, 1, 1, 2, 2, 1, 1, 2, 2})}, - window_json, - sort_json); - - testOneWindowFunction( - {NameAndTypePair("partition", makeNullable(std::make_shared())), NameAndTypePair("order", makeNullable(std::make_shared()))}, - {toNullableVec("partition", {{}, {}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 1, 2, 2, 1, 1, 2, 2})}, - {toNullableVec("partition", {{}, {}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 1, 2, 2, 1, 1, 2, 2}), toNullableVec("rank", {1, 2, 1, 1, 3, 3, 1, 1, 3, 3}), toNullableVec("dense_rank", {1, 2, 1, 1, 2, 2, 1, 1, 2, 2})}, - window_json, - sort_json); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2})}), + createColumns({toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2}), + toNullableVec("rank", {1, 1, 1, 3, 3, 1, 1, 3, 3}), + toNullableVec("dense_rank", {1, 1, 1, 2, 2, 1, 1, 2, 2})})); + + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource( + request, + {toNullableVec("partition", {{}, {}, 1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {{}, 1, 1, 1, 2, 2, 1, 1, 2, 2})}), + createColumns({toNullableVec("partition", {{}, {}, 1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {{}, 1, 1, 1, 2, 2, 1, 1, 2, 2}), + toNullableVec("rank", {1, 2, 1, 1, 3, 3, 1, 1, 3, 3}), + toNullableVec("dense_rank", {1, 2, 1, 1, 2, 2, 1, 1, 2, 2})})); } CATCH + } // namespace DB::tests diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 5fd25c5d238..2bedb312d07 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -198,3 +198,7 @@ if (ARCH_AMD64) src/crc64_sse2_asimd.cpp APPEND COMPILE_FLAGS "-mpclmul") endif() + +if (ARCH_AARCH64 AND ARCH_LINUX) + target_link_libraries (common PUBLIC tiflash-aarch64-string tiflash-aarch64-math) +endif() diff --git a/libs/libcommon/include/common/getMemoryAmount.h b/libs/libcommon/include/common/getMemoryAmount.h index 98aa87661c3..0807c6f8e12 100644 --- a/libs/libcommon/include/common/getMemoryAmount.h +++ b/libs/libcommon/include/common/getMemoryAmount.h @@ -19,5 +19,6 @@ /** * Returns the size of physical memory (RAM) in bytes. * Returns 0 on unsupported platform +* Note: does not support environments under resource isolation mechanisms such as Docker or cgroups.
*/ uint64_t getMemoryAmount(); diff --git a/libs/libcommon/include/common/types.h b/libs/libcommon/include/common/types.h index 139fc10e980..87c7215d91f 100644 --- a/libs/libcommon/include/common/types.h +++ b/libs/libcommon/include/common/types.h @@ -25,6 +25,7 @@ #if defined(__clang__) #pragma GCC diagnostic ignored "-Wunknown-warning-option" #pragma GCC diagnostic ignored "-Wdeprecated-copy" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" #pragma GCC diagnostic ignored "-Wtautological-constant-out-of-range-compare" #endif #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" diff --git a/metrics/grafana/tiflash_summary.json b/metrics/grafana/tiflash_summary.json index f899a47ed10..0d72f950add 100644 --- a/metrics/grafana/tiflash_summary.json +++ b/metrics/grafana/tiflash_summary.json @@ -52,7 +52,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1653635389238, + "iteration": 1654217728945, "links": [], "panels": [ { @@ -542,7 +542,14 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/limit/", + "fill": 0, + "nullPointMode": "null", + "color": "#C4162A" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, @@ -633,6 +640,13 @@ "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "K" + }, + { + "expr": "sum(tiflash_system_current_metric_MemoryCapacity{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "legendFormat": "limit-{{instance}}", + "exemplar": true, + "refId": "L", + "hide": false } ], "thresholds": [], @@ -701,15 +715,15 @@ "hiddenSeries": false, "id": 51, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, + "current": true, "max": false, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -728,6 +742,12 @@ "alias": "total", "fill": 0, "lines": false + }, + { + "alias": "/limit/", + "fill": 0, + "nullPointMode": "null", + "color": "#C4162A" } ], "spaceLength": 10, @@ -742,6 +762,13 @@ "legendFormat": "{{instance}}", "refId": "A", "step": 40 + }, + { + "expr": "sum(tiflash_system_current_metric_LogicalCPUCores{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "legendFormat": "limit-{{instance}}", + "exemplar": true, + "refId": "B", + "intervalFactor": 1 } ], "thresholds": [], @@ -3878,7 +3905,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { - "h": 8, + "h": 5, "w": 12, "x": 0, "y": 21 @@ -3893,6 +3920,7 @@ "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3908,38 +3936,27 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/(delta_merge)|(seg_)/", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, - "steppedLine": false, + "steppedLine": true, "targets": [ { - "expr": "sum(rate(tiflash_storage_subtask_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!~\"delta_merge|delta_merge_fg|delta_merge_bg_gc|seg_merge|seg_split|seg_split_fg\"}[1m])) by (type)", + "exemplar": true, + "expr": "sum(rate(tiflash_storage_subtask_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (type)", "format": "time_series", "hide": false, - 
"intervalFactor": 1, + "interval": "", + "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" - }, - { - "expr": "sum(increase(tiflash_storage_subtask_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"delta_merge|delta_merge_fg|delta_merge_bg_gc|seg_merge|seg_split|seg_split_fg\"}[1m])) by (type)", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Internal Tasks OPS", + "title": "Small Internal Tasks OPS", "tooltip": { "shared": true, "sort": 0, @@ -3955,7 +3972,7 @@ }, "yaxes": [ { - "decimals": null, + "decimals": 1, "format": "ops", "label": null, "logBase": 1, @@ -3969,7 +3986,7 @@ "logBase": 1, "max": null, "min": "0", - "show": true + "show": false } ], "yaxis": { @@ -3988,10 +4005,10 @@ "defaults": {}, "overrides": [] }, - "fill": 1, + "fill": 0, "fillGradient": 0, "gridPos": { - "h": 8, + "h": 5, "w": 12, "x": 12, "y": 21 @@ -4023,58 +4040,233 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "alias": "/^.*-delta_merge/", - "yaxis": 2 + "exemplar": false, + "expr": "histogram_quantile(1, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (le,type))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "max-{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Small Internal Tasks Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "alias": "/^.*-seg_split/", - "yaxis": 2 + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": false } ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Total number of storage's internal sub tasks", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 26 + }, + "hiddenSeries": false, + "id": 130, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], "spaceLength": 10, "stack": false, - "steppedLine": false, + "steppedLine": true, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", + "exemplar": true, + "expr": 
"sum(rate(tiflash_storage_subtask_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (type)", "format": "time_series", "hide": false, - "intervalFactor": 1, - "legendFormat": "max-{{type}}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", "refId": "A" - }, + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Large Internal Tasks OPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ { - "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "99-{{type}}", - "refId": "B" + "decimals": 1, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "expr": "histogram_quantile(0.95, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "95-{{type}}", - "refId": "C" - }, + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Duration of storage's internal sub tasks", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 26 + }, + "hiddenSeries": false, + "id": 131, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "expr": "histogram_quantile(0.80, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", + "exemplar": true, + "expr": "histogram_quantile(1, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (le,type))", "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "80-{{type}}", - "refId": "D" + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "max-{{type}}", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Internal Tasks Duration", + "title": "Large Internal Tasks Duration", "tooltip": { "shared": true, "sort": 0, @@ -4090,6 +4282,7 @@ }, "yaxes": [ { + "decimals": 
1, "format": "s", "label": null, "logBase": 1, @@ -4103,7 +4296,7 @@ "logBase": 1, "max": null, "min": "0", - "show": true + "show": false } ], "yaxis": { @@ -4128,7 +4321,7 @@ "h": 8, "w": 12, "x": 0, - "y": 29 + "y": 31 }, "hiddenSeries": false, "id": 43, @@ -4234,7 +4427,7 @@ "h": 8, "w": 12, "x": 12, - "y": 29 + "y": 31 }, "heatmap": {}, "hideZeroBuckets": true, @@ -4297,7 +4490,7 @@ "h": 8, "w": 12, "x": 0, - "y": 37 + "y": 39 }, "hiddenSeries": false, "id": 46, @@ -4420,7 +4613,7 @@ "h": 8, "w": 12, "x": 12, - "y": 37 + "y": 39 }, "hiddenSeries": false, "id": 47, @@ -4544,7 +4737,7 @@ "h": 8, "w": 12, "x": 0, - "y": 45 + "y": 47 }, "height": "", "hiddenSeries": false, @@ -4674,7 +4867,7 @@ "h": 8, "w": 12, "x": 12, - "y": 45 + "y": 47 }, "height": "", "hiddenSeries": false, @@ -4802,7 +4995,7 @@ "h": 8, "w": 12, "x": 0, - "y": 53 + "y": 55 }, "hiddenSeries": false, "id": 88, @@ -5002,7 +5195,7 @@ "h": 8, "w": 12, "x": 12, - "y": 53 + "y": 55 }, "hiddenSeries": false, "id": 67, @@ -5116,7 +5309,7 @@ "h": 8, "w": 12, "x": 0, - "y": 61 + "y": 63 }, "hiddenSeries": false, "id": 84, @@ -5216,7 +5409,7 @@ "h": 8, "w": 12, "x": 12, - "y": 61 + "y": 63 }, "hiddenSeries": false, "id": 86, @@ -8183,5 +8376,5 @@ "timezone": "", "title": "Test-Cluster-TiFlash-Summary", "uid": "SVbh2xUWk", - "version": 2 -} + "version": 1 +} \ No newline at end of file diff --git a/release-centos7-llvm/Makefile b/release-centos7-llvm/Makefile index 1b15df7ddc3..9c1bba42a53 100644 --- a/release-centos7-llvm/Makefile +++ b/release-centos7-llvm/Makefile @@ -23,6 +23,10 @@ image_tiflash_llvm_base_aarch64: build_tiflash_release_amd64: docker run --rm -v $(realpath ..):/build/tics hub.pingcap.net/tiflash/tiflash-llvm-base:amd64 /build/tics/release-centos7-llvm/scripts/build-release.sh +# Add build_tiflash_debug_amd64 target to enable FailPoints on x86. Since outputs are the same as release version, no new package targets added. +build_tiflash_debug_amd64: + docker run --rm -v $(realpath ..):/build/tics hub.pingcap.net/tiflash/tiflash-llvm-base:amd64 /build/tics/release-centos7-llvm/scripts/build-debug.sh + build_tiflash_ci_amd64: docker run --rm -v $(realpath ..):/build/tics hub.pingcap.net/tiflash/tiflash-llvm-base:amd64 /build/tics/release-centos7-llvm/scripts/build-tiflash-ci.sh diff --git a/dbms/src/Storages/DeltaMerge/tools/CMakeLists.txt b/release-centos7-llvm/scripts/build-debug.sh old mode 100644 new mode 100755 similarity index 76% rename from dbms/src/Storages/DeltaMerge/tools/CMakeLists.txt rename to release-centos7-llvm/scripts/build-debug.sh index 36270a0c8e4..59dc9b86a54 --- a/dbms/src/Storages/DeltaMerge/tools/CMakeLists.txt +++ b/release-centos7-llvm/scripts/build-debug.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright 2022 PingCAP, Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
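For context on what the `ENABLE_FAILPOINTS` toggle controls (the Makefile comment above adds a debug target precisely to turn it on, and the build script below wires `-DENABLE_FAILPOINTS=ON` for debug builds): a fail point is a compile-time-guarded hook that tests can switch on at runtime to inject failures, as the `__enable_fail_point` calls in the tests later in this patch do. A minimal sketch of the pattern, with illustrative names rather than TiFlash's actual macros:

```cpp
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>

// Hypothetical registry of enabled fail points; TiFlash's real machinery
// is richer than this sketch.
inline std::set<std::string> & enabledFailPoints()
{
    static std::set<std::string> s;
    return s;
}

#ifdef ENABLE_FAILPOINTS
// Debug builds compile the check in, so tests can inject failures.
#define FAIL_POINT_TRIGGER(name)                                                        \
    do                                                                                  \
    {                                                                                   \
        if (enabledFailPoints().count(name))                                            \
            throw std::runtime_error("Fail point " + std::string(name) + " is triggered."); \
    } while (false)
#else
// Release builds compile fail points away entirely, which is why a
// dedicated debug image is needed to run the fail-point tests.
#define FAIL_POINT_TRIGGER(name) \
    do                           \
    {                            \
    } while (false)
#endif

int main()
{
    enabledFailPoints().insert("exception_demo"); // hypothetical fail point name
    try
    {
        FAIL_POINT_TRIGGER("exception_demo");
    }
    catch (const std::runtime_error & e)
    {
        std::cout << e.what() << '\n'; // "Fail point exception_demo is triggered."
    }
}
```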
-include_directories (${CMAKE_CURRENT_BINARY_DIR}) -add_subdirectory (workload EXCLUDE_FROM_ALL) +CMAKE_PREFIX_PATH=$1 + +set -ueox pipefail + +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" + +${SCRIPTPATH}/build-tiflash-release.sh "DEBUG" "${CMAKE_PREFIX_PATH}" diff --git a/release-centos7-llvm/scripts/build-tiflash-release.sh b/release-centos7-llvm/scripts/build-tiflash-release.sh index 42993b51afe..01ca00e8706 100755 --- a/release-centos7-llvm/scripts/build-tiflash-release.sh +++ b/release-centos7-llvm/scripts/build-tiflash-release.sh @@ -47,7 +47,13 @@ ENABLE_PCH=${ENABLE_PCH:-ON} INSTALL_DIR="${SRCPATH}/release-centos7-llvm/tiflash" rm -rf ${INSTALL_DIR} && mkdir -p ${INSTALL_DIR} -BUILD_DIR="${SRCPATH}/release-centos7-llvm/build-release" +if [ $CMAKE_BUILD_TYPE == "RELWITHDEBINFO" ]; then + BUILD_DIR="$SRCPATH/release-centos7-llvm/build-release" + ENABLE_FAILPOINTS="OFF" +else + BUILD_DIR="$SRCPATH/release-centos7-llvm/build-debug" + ENABLE_FAILPOINTS="ON" +fi rm -rf ${BUILD_DIR} && mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR} cmake -S "${SRCPATH}" \ @@ -55,6 +61,7 @@ cmake -S "${SRCPATH}" \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DENABLE_TESTING=OFF \ -DENABLE_TESTS=OFF \ + -DENABLE_FAILPOINTS=${ENABLE_FAILPOINTS} \ -Wno-dev \ -DUSE_CCACHE=OFF \ -DRUN_HAVE_STD_REGEX=0 \ diff --git a/tests/fullstack-test-dt/clustered_index/ddl.test b/tests/fullstack-test-dt/clustered_index/ddl.test index 8abe450c11a..6c4925c9619 100644 --- a/tests/fullstack-test-dt/clustered_index/ddl.test +++ b/tests/fullstack-test-dt/clustered_index/ddl.test @@ -66,3 +66,89 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select * from test.t_2 mysql> drop table test.t_1; mysql> drop table test.t_2; + +### about issue 5154: check whether add column/drop column will affect clustered index decoding +### drop a column between two columns that are clustered index columns + +mysql> drop table if exists test.t_3; +mysql> create table test.t_3 (A int, B varchar(20), C int, D int, PRIMARY KEY(A,C) CLUSTERED); +mysql> insert into test.t_3 values (1,'1',1,1),(2,'2',2,2); + +mysql> alter table test.t_3 set tiflash replica 1; + +func> wait_table test t_3 + +mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3; ++---+---+---+---+ +| A | B | C | D | ++---+---+---+---+ +| 1 | 1 | 1 | 1 | +| 2 | 2 | 2 | 2 | ++---+---+---+---+ + +mysql> alter table test.t_3 drop column B; + +mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3; ++---+---+---+ +| A | C | D | ++---+---+---+ +| 1 | 1 | 1 | +| 2 | 2 | 2 | ++---+---+---+ + +# insert some rows +mysql> insert into test.t_3 values (3,3,3),(4,4,4); + +mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3; ++---+---+---+ +| A | C | D | ++---+---+---+ +| 1 | 1 | 1 | +| 2 | 2 | 2 | +| 3 | 3 | 3 | +| 4 | 4 | 4 | ++---+---+---+ + +mysql> drop table test.t_3; + +### add a column between two columns that are clustered index columns +mysql> drop table if exists test.t_4 +mysql> create table test.t_4 (A int, B varchar(20), C int, D int, PRIMARY KEY(A,C) CLUSTERED); + +mysql> insert into test.t_4 values (1,'1',1,1),(2,'2',2,2); + +mysql> alter table test.t_4 set tiflash replica 1; + +func> wait_table test t_4 + +mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_4; ++---+---+---+---+ +| A | B | C | D | ++---+---+---+---+ +| 1 | 1 | 1 | 1 | +| 2 | 2 | 2 | 2 | ++---+---+---+---+ + +mysql> alter table test.t_4 Add column E int after B; + +mysql> set session 
tidb_isolation_read_engines='tiflash';select * from test.t_4; ++---+---+------+---+---+ +| A | B | E | C | D | ++---+---+------+---+---+ +| 1 | 1 | NULL | 1 | 1 | +| 2 | 2 | NULL | 2 | 2 | ++---+---+------+---+---+ + +mysql> insert into test.t_4 values (3,'3',3,3,3),(4,'4',4,4,4); + +mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_4; ++---+---+------+------+------+ +| A | B | E | C | D | ++---+---+------+------+------+ +| 1 | 1 | NULL | 1 | 1 | +| 2 | 2 | NULL | 2 | 2 | +| 3 | 3 | 3 | 3 | 3 | +| 4 | 4 | 4 | 4 | 4 | ++---+---+------+------+------+ + +mysql> drop table test.t_4; \ No newline at end of file diff --git a/tests/fullstack-test/expr/bitshift_operator.test b/tests/fullstack-test/expr/bitshift_operator.test new file mode 100644 index 00000000000..0d55a1b56a9 --- /dev/null +++ b/tests/fullstack-test/expr/bitshift_operator.test @@ -0,0 +1,43 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +mysql> drop table if exists test.t; +mysql> create table test.t (a int); +mysql> alter table test.t set tiflash replica 1; +mysql> insert into test.t values(-1); + +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set @@session.tidb_isolation_read_engines = "tiflash"; select a>>0 as v1, a>>64 as v2, a>>10 as v3 from test.t; ++----------------------+------+-------------------+ +| v1 | v2 | v3 | ++----------------------+------+-------------------+ +| 18446744073709551615 | 0 | 18014398509481983 | ++----------------------+------+-------------------+ + +mysql> set tidb_enforce_mpp=1; set @@session.tidb_isolation_read_engines = "tiflash"; select a from test.t where a>>100000=0; ++------+ +| a | ++------+ +| -1 | ++------+ + +mysql> set tidb_enforce_mpp=1; set @@session.tidb_isolation_read_engines = "tiflash"; select a from test.t where a>>63=1; ++------+ +| a | ++------+ +| -1 | ++------+ + +mysql> drop table if exists test.t diff --git a/tests/fullstack-test/expr/duration_pushdown.test b/tests/fullstack-test/expr/duration_pushdown.test index 63106fa1788..442a708a802 100644 --- a/tests/fullstack-test/expr/duration_pushdown.test +++ b/tests/fullstack-test/expr/duration_pushdown.test @@ -106,6 +106,14 @@ mysql> use test; set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflas # | 123500 | # +----------------+ +mysql> use test; set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select time_to_sec(a) from t; ++----------------+ +| time_to_sec(a) | ++----------------+ +| 2520610 | +| -2520610 | ++----------------+ + mysql> drop table if exists test.time_test; mysql> create table test.time_test(id int(11),v1 time(3) not null, v2 time(3)); diff --git a/tests/fullstack-test/expr/format.test b/tests/fullstack-test/expr/format.test index 8cea75d6914..719e30c974d 100644 --- a/tests/fullstack-test/expr/format.test +++ b/tests/fullstack-test/expr/format.test @@ -44,3 +44,52 @@ int_val 1,234.000 mysql> drop table if exists test.t + +mysql> create table test.t(id int, value decimal(65,4)) +mysql> alter table test.t set 
tiflash replica 1 +mysql> insert into test.t values(1,9999999999999999999999999999999999999999999999999999999999999.9999) + +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,-3) as result from test.t +result +10,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,0) as result from test.t +result +10,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,3) as result from test.t +result +10,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000.000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,10) as result from test.t +result +9,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999.9999000000 + + +mysql> drop table if exists test.t + +mysql> create table test.t(id int, value decimal(7,4)) +mysql> alter table test.t set tiflash replica 1 +mysql> insert into test.t values(1,999.9999) + +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,-2) as result from test.t +result +1,000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,0) as result from test.t +result +1,000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,2) as result from test.t +result +1,000.00 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,10) as result from test.t +result +999.9999000000 + +mysql> drop table if exists test.t diff --git a/tests/fullstack-test/expr/get_format.test b/tests/fullstack-test/expr/get_format.test new file mode 100644 index 00000000000..5409302c10a --- /dev/null +++ b/tests/fullstack-test/expr/get_format.test @@ -0,0 +1,60 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
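The format.test cases above pin down two behaviors of `FORMAT(X, D)`: a negative `D` is clamped to 0, and the integer part is grouped with commas after rounding (so `999.9999` at `D = 0` rounds up to `1,000`). A small model of the grouping step, assuming rounding has already produced the digit string:

```cpp
#include <cassert>
#include <string>

// Insert thousands separators into a non-negative integer digit string,
// mimicking MySQL FORMAT()'s default en_US grouping. Rounding to d
// decimal places (and clamping d < 0 to 0) happens before this step.
std::string groupDigits(const std::string & digits)
{
    std::string out;
    const size_t n = digits.size();
    for (size_t i = 0; i < n; ++i)
    {
        out += digits[i];
        const size_t remaining = n - i - 1;
        if (remaining > 0 && remaining % 3 == 0)
            out += ',';
    }
    return out;
}

int main()
{
    // 999.9999 rounded to 0 decimal places becomes 1000 -> "1,000",
    // matching the expected output in the test above.
    assert(groupDigits("1000") == "1,000");
    assert(groupDigits("1234567") == "1,234,567");
}
```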
+ +mysql> drop table if exists test.t; +mysql> create table test.t(location varchar(10)); +mysql> insert into test.t values('USA'), ('JIS'), ('ISO'), ('EUR'), ('INTERNAL'); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set @@tidb_enforce_mpp=1; set @@tidb_isolation_read_engines='tiflash'; select GET_FORMAT(DATE, location) from test.t; ++----------------------------+ +| GET_FORMAT(DATE, location) | ++----------------------------+ +| %m.%d.%Y | +| %Y-%m-%d | +| %Y-%m-%d | +| %d.%m.%Y | +| %Y%m%d | ++----------------------------+ +mysql> set @@tidb_enforce_mpp=1; set @@tidb_isolation_read_engines='tiflash'; select GET_FORMAT(DATETIME, location) from test.t; ++--------------------------------+ +| GET_FORMAT(DATETIME, location) | ++--------------------------------+ +| %Y-%m-%d %H.%i.%s | +| %Y-%m-%d %H:%i:%s | +| %Y-%m-%d %H:%i:%s | +| %Y-%m-%d %H.%i.%s | +| %Y%m%d%H%i%s | ++--------------------------------+ +mysql> set @@tidb_enforce_mpp=1; set @@tidb_isolation_read_engines='tiflash'; select GET_FORMAT(TIMESTAMP, location) from test.t; ++---------------------------------+ +| GET_FORMAT(TIMESTAMP, location) | ++---------------------------------+ +| %Y-%m-%d %H.%i.%s | +| %Y-%m-%d %H:%i:%s | +| %Y-%m-%d %H:%i:%s | +| %Y-%m-%d %H.%i.%s | +| %Y%m%d%H%i%s | ++---------------------------------+ +mysql> set @@tidb_enforce_mpp=1; set @@tidb_isolation_read_engines='tiflash'; select GET_FORMAT(TIME, location) from test.t; ++----------------------------+ +| GET_FORMAT(TIME, location) | ++----------------------------+ +| %h:%i:%s %p | +| %H:%i:%s | +| %H:%i:%s | +| %H.%i.%s | +| %H%i%s | ++----------------------------+ +mysql> drop table if exists test.t; diff --git a/tests/fullstack-test/expr/reverse.test b/tests/fullstack-test/expr/reverse.test new file mode 100644 index 00000000000..9195adf2b7d --- /dev/null +++ b/tests/fullstack-test/expr/reverse.test @@ -0,0 +1,44 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
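`GET_FORMAT`, exercised by get_format.test above, is essentially a constant lookup from (type, locale) to a format string. The entries below are copied from the test's expected output; the partial table and names are a sketch, not TiFlash's implementation:

```cpp
#include <iostream>
#include <map>
#include <string>
#include <utility>

// (type, locale) -> format string, per the expected results above.
// Only a subset of the table is reproduced here.
const std::map<std::pair<std::string, std::string>, std::string> kFormats = {
    {{"DATE", "USA"}, "%m.%d.%Y"},
    {{"DATE", "JIS"}, "%Y-%m-%d"},
    {{"DATE", "ISO"}, "%Y-%m-%d"},
    {{"DATE", "EUR"}, "%d.%m.%Y"},
    {{"DATE", "INTERNAL"}, "%Y%m%d"},
    {{"TIME", "USA"}, "%h:%i:%s %p"},
    {{"TIME", "EUR"}, "%H.%i.%s"},
    {{"TIME", "INTERNAL"}, "%H%i%s"},
};

int main()
{
    std::cout << kFormats.at({"DATE", "USA"}) << '\n'; // %m.%d.%Y
    std::cout << kFormats.at({"TIME", "EUR"}) << '\n'; // %H.%i.%s
}
```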
+ +mysql> drop table if exists test.t; +mysql> create table if not exists test.t(a varchar(256)); + + +mysql> insert into test.t values('one week’s time test'); +mysql> insert into test.t values('abc测试def'); +mysql> insert into test.t values('abcテストabc'); +mysql> insert into test.t values('ѐёђѓєѕіїјљњћќѝўџ'); +mysql> insert into test.t values('+ѐ-ё*ђ/ѓ!є@ѕ#і@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^'); +mysql> insert into test.t values('αβγδεζηθικλμνξοπρστυφχψωσ'); +mysql> insert into test.t values('▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕'); +mysql> insert into test.t values('թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ'); +mysql> insert into test.t values(NULL); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select reverse(a) from test.t; ++-------------------------------------------------------------------------------------------------+ +| reverse(a) | ++-------------------------------------------------------------------------------------------------+ +| tset emit s’keew eno | +| fed试测cba | +| cbaトステcba | +| џўѝќћњљјїіѕєѓђёѐ | +| ^!џ@ў#ѝ¥ќ)ћ(њ&љ……ј%ї@і#ѕ@є!ѓ/ђ*ё-ѐ+ | +| σωψχφυτσρποξνμλκιθηζεδγβα | +| ✕σ★ω♘ψχ♖φυ♥τσ℉ρπ✚οξ✓νμ♫λκ€ιθ✂ηζ☎εδ➨γβ▼α▲ | +| շմնբվցղզխլկյհգֆդսապօիւըտռեոքծժճչրջձփթ | +| NULL | ++-------------------------------------------------------------------------------------------------+ diff --git a/tests/fullstack-test/mpp/issue_2471.test b/tests/fullstack-test/mpp/issue_2471.test index 4a1528595e8..9966eaadec3 100644 --- a/tests/fullstack-test/mpp/issue_2471.test +++ b/tests/fullstack-test/mpp/issue_2471.test @@ -35,7 +35,7 @@ mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_opt_bro => DBGInvoke __enable_fail_point(exception_in_creating_set_input_stream) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_opt_broadcast_cartesian_join=2; select * from a as t1 left join a as t2 on t1.id = t2.id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_in_creating_set_input_stream is triggered. +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_in_creating_set_input_stream is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_in_creating_set_input_stream) diff --git a/tests/fullstack-test/mpp/mpp_fail.test b/tests/fullstack-test/mpp/mpp_fail.test index 7af5fef3f89..0e272c0b621 100644 --- a/tests/fullstack-test/mpp/mpp_fail.test +++ b/tests/fullstack-test/mpp/mpp_fail.test @@ -71,20 +71,20 @@ ERROR 1105 (HY000) at line 1: DB::Exception: Fail point FailPoints::exception_be ## exception during mpp run non root task => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered. 
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception,, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) ## exception during mpp run root task => DBGInvoke __enable_fail_point(exception_during_mpp_root_task_run) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_during_mpp_root_task_run is triggered. +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_root_task_run is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_root_task_run) ## exception during mpp write err to tunnel => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __enable_fail_point(exception_during_mpp_write_err_to_tunnel) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : Failed to write error msg to tunnel +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Failed to write error msg to tunnel, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __disable_fail_point(exception_during_mpp_write_err_to_tunnel) @@ -92,7 +92,7 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchang => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __enable_fail_point(exception_during_mpp_close_tunnel) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered. +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception,, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __disable_fail_point(exception_during_mpp_close_tunnel) @@ -125,7 +125,7 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchang ## ensure build1, build2-probe1, probe2 in the CreatingSets, test the bug where build1 throw exception but not change the build state, thus block the build2-probe1, at last this query hangs. 
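The hang described in the comment above is a lost-notification bug: build1 threw before publishing a terminal build state, so build2-probe1 waited forever. A minimal sketch of the invariant the test guards (a failing build must still wake its waiters), with illustrative names rather than TiFlash's actual classes; the commands just below reproduce it by forcing the hash build to throw:

```cpp
#include <condition_variable>
#include <exception>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>

struct BuildState
{
    std::mutex mu;
    std::condition_variable cv;
    bool finished = false;
    std::exception_ptr error;

    void finish(std::exception_ptr e = nullptr)
    {
        {
            std::lock_guard lk(mu);
            finished = true;
            error = e;
        }
        cv.notify_all(); // wake waiters even when the build failed
    }

    void waitForBuild()
    {
        std::unique_lock lk(mu);
        cv.wait(lk, [&] { return finished; });
        if (error)
            std::rethrow_exception(error); // propagate instead of hanging
    }
};

int main()
{
    BuildState build1;
    std::thread probe([&] {
        try
        {
            build1.waitForBuild();
        }
        catch (const std::exception & e)
        {
            std::cout << "probe unblocked: " << e.what() << '\n';
        }
    });
    build1.finish(std::make_exception_ptr(std::runtime_error("exception_mpp_hash_build")));
    probe.join();
}
```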
=> DBGInvoke __enable_fail_point(exception_mpp_hash_build) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; set @@tidb_broadcast_join_threshold_count=0; set @@tidb_broadcast_join_threshold_size=0; select t1.id from test.t t1 join test.t t2 on t1.id = t2.id and t1.id <2 join (select id from test.t group by id) t3 on t2.id=t3.id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered. +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_mpp_hash_build) # Clean up. diff --git a/tests/fullstack-test/mpp/window.test b/tests/fullstack-test/mpp/window.test new file mode 100644 index 00000000000..698d39ef2ea --- /dev/null +++ b/tests/fullstack-test/mpp/window.test @@ -0,0 +1,32 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +mysql> drop table if exists test.t1; +mysql> create table test.t1(c1 int, c2 int); +mysql> insert into test.t1 values(1, 1),(2, 2),(3, 3),(1, 1),(2, 2),(3, 3),(4, 4); +mysql> alter table test.t1 set tiflash replica 1; +func> wait_table test t1 +mysql> use test; set @@tidb_isolation_read_engines='tiflash'; select c1, c2, row_number() over w2, row_number() over w1 from test.t1 window w1 as(partition by c1), w2 as (partition by c1, c2) order by 1, 2, 3, 4; ++------+------+----------------------+----------------------+ +| c1 | c2 | row_number() over w2 | row_number() over w1 | ++------+------+----------------------+----------------------+ +| 1 | 1 | 1 | 1 | +| 1 | 1 | 2 | 2 | +| 2 | 2 | 1 | 1 | +| 2 | 2 | 2 | 2 | +| 3 | 3 | 1 | 1 | +| 3 | 3 | 2 | 2 | +| 4 | 4 | 1 | 1 | ++------+------+----------------------+----------------------+ +mysql> drop table if exists test.t1; diff --git a/tests/fullstack-test2/ddl/alter_table_tiflash_replica_and_mode.test b/tests/fullstack-test2/ddl/alter_table_tiflash_replica_and_mode.test new file mode 100644 index 00000000000..5e43936379b --- /dev/null +++ b/tests/fullstack-test2/ddl/alter_table_tiflash_replica_and_mode.test @@ -0,0 +1,89 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
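window.test above pins down `row_number()` semantics: numbering restarts at 1 for each distinct partition key, independently per window definition (w1 partitions by c1, w2 by c1 and c2). A scalar model of the computation:

```cpp
#include <cassert>
#include <map>
#include <utility>
#include <vector>

// row_number() restarts at 1 for each distinct partition key; the input
// is assumed to be already sorted by that key, as the window test is.
std::vector<int> rowNumber(const std::vector<std::pair<int, int>> & sorted_rows)
{
    std::vector<int> out;
    std::map<std::pair<int, int>, int> seen; // rows per partition so far
    for (const auto & key : sorted_rows)
        out.push_back(++seen[key]);
    return out;
}

int main()
{
    // (c1, c2) pairs from the test, in the order of the expected output.
    std::vector<std::pair<int, int>> rows
        = {{1, 1}, {1, 1}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {4, 4}};
    // Matches the "row_number() over w2" column above.
    assert((rowNumber(rows) == std::vector<int>{1, 2, 1, 2, 1, 2, 1}));
}
```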
+ +# test tiflash replica for normal case +mysql> drop table if exists test.t +mysql> create table test.t(a int) +mysql> alter table test.t set tiflash replica 1 + +func> wait_table test t + +>> DBGInvoke get_tiflash_replica_count("test", "t") +┌─get_tiflash_replica_count(test, t)─┐ +│ 1 │ +└────────────────────────────────────┘ + +# test tiflash mode in normal mode +>> DBGInvoke get_tiflash_mode("test", "t") +┌─get_tiflash_mode(test, t)─┐ +│ │ +└───────────────────────────┘ + +mysql> alter table test.t set tiflash mode fast + +>> DBGInvoke __refresh_schemas() + +# test tiflash mode in fast mode +>> DBGInvoke get_tiflash_mode("test", "t") +┌─get_tiflash_mode(test, t)───┐ +│ fast │ +└─────────────────────────────┘ + +# test replica for partition tables +mysql> drop table if exists test.t +mysql> create table test.t (x int) partition by range (x) (partition p0 values less than (5), partition p1 values less than (10)); +mysql> alter table test.t set tiflash mode fast +mysql> alter table test.t set tiflash replica 1 + +func> wait_table test t + +>> DBGInvoke get_tiflash_replica_count("test", "t") +┌─get_tiflash_replica_count(test, t)─┐ +│ 1 │ +└────────────────────────────────────┘ + +>> DBGInvoke get_tiflash_mode("test", "t") +┌─get_tiflash_mode(test, t)──────────┐ +│ fast │ +└────────────────────────────────────┘ + +>> DBGInvoke get_partition_tables_tiflash_replica_count("test", "t") +┌─get_partition_tables_tiflash_replica_count(test, t)─┐ +│ 1/1/ │ +└─────────────────────────────────────────────────────┘ + +# test tiflash mode for partition tables +>> DBGInvoke get_partition_tables_tiflash_mode("test", "t") +┌─get_partition_tables_tiflash_mode(test, t)─┐ +│ fast/fast/ │ +└────────────────────────────────────────────┘ + +# test replica for add partition tables after set replica +mysql> alter table test.t add partition (partition p2 values less than (2010)); + +>> DBGInvoke __refresh_schemas() + +>> DBGInvoke get_partition_tables_tiflash_replica_count("test", "t") +┌─get_partition_tables_tiflash_replica_count(test, t)─┐ +│ 1/1/1/ │ +└─────────────────────────────────────────────────────┘ + +# test tiflash mode for add partition tables after set replica +>> DBGInvoke get_partition_tables_tiflash_mode("test", "t") +┌─get_partition_tables_tiflash_mode(test, t)─┐ +│ fast/fast/fast/ │ +└────────────────────────────────────────────┘ + + + diff --git a/tests/fullstack-test2/ddl/alter_tiflash_mode.test b/tests/fullstack-test2/ddl/alter_tiflash_mode.test new file mode 100644 index 00000000000..c9f3ef488c4 --- /dev/null +++ b/tests/fullstack-test2/ddl/alter_tiflash_mode.test @@ -0,0 +1,48 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
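The `get_partition_tables_*` debug helpers above report one value per physical partition, each terminated by `/` (hence `1/1/` for two partitions, and `1/1/1/` or `fast/fast/fast/` once a third partition is added). A sketch of that formatting:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Join per-partition values with a trailing '/', matching the rendering
// of the DBGInvoke helpers in the tests above.
std::string joinWithSlash(const std::vector<std::string> & per_partition)
{
    std::string out;
    for (const auto & v : per_partition)
        out += v + "/";
    return out;
}

int main()
{
    assert(joinWithSlash({"1", "1"}) == "1/1/");
    assert(joinWithSlash({"fast", "fast", "fast"}) == "fast/fast/fast/");
}
```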
+ +mysql> drop table if exists test.t +mysql> create table test.t(a int, b int) +mysql> alter table test.t set tiflash replica 1 + +func> wait_table test t + +# check default mode of tiflash table +mysql> select table_schema,table_name,replica_count,available,table_mode from information_schema.tiflash_replica where table_schema='test' and table_name='t'; ++--------------+------------+---------------+-----------+-----------+ +| table_schema | table_name | replica_count | available | table_mode| ++--------------+------------+---------------+-----------+-----------+ +| test | t | 1 | 1 | NORMAL | ++--------------+------------+---------------+-----------+-----------+ + +# check changing mode to fast + +mysql> alter table test.t set tiflash mode fast +mysql> select table_schema,table_name,replica_count,available,table_mode from information_schema.tiflash_replica where table_schema='test' and table_name='t'; ++--------------+------------+---------------+-----------+-----------+ +| table_schema | table_name | replica_count | available | table_mode| ++--------------+------------+---------------+-----------+-----------+ +| test | t | 1 | 1 | FAST | ++--------------+------------+---------------+-----------+-----------+ + +# check changing mode back to normal +mysql> alter table test.t set tiflash mode normal +mysql> select table_schema,table_name,replica_count,available,table_mode from information_schema.tiflash_replica where table_schema='test' and table_name='t'; ++--------------+------------+---------------+-----------+-----------+ +| table_schema | table_name | replica_count | available | table_mode| ++--------------+------------+---------------+-----------+-----------+ +| test | t | 1 | 1 | NORMAL | ++--------------+------------+---------------+-----------+-----------+ + +mysql> drop table if exists test.t \ No newline at end of file diff --git a/tests/fullstack-test2/ddl/multi_alter_with_write.test b/tests/fullstack-test2/ddl/multi_alter_with_write.test new file mode 100644 index 00000000000..3284511d775 --- /dev/null +++ b/tests/fullstack-test2/ddl/multi_alter_with_write.test @@ -0,0 +1,880 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this test focuses on the case when multiple DDL actions happen close together +# (and these DDL actions will be fetched in the same regular schema sync round), +# with corresponding insert (write) actions issued between these DDL actions. +# Considering that these write actions and schema changes may arrive at +# tiflash in different orders, we simulate the different orderings to check +# that our schema module works correctly.
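All of the orderings below reduce to one mechanism: when TiFlash cannot decode an incoming write with its cached schema, it syncs the schema (picking up any pending alters) and retries, so a DDL can take effect lazily at write time rather than only on the regular sync. A minimal model of that retry, under the simplifying assumption that schema freshness is a plain integer version:

```cpp
#include <iostream>
#include <stdexcept>

// Not TiFlash's real code: a write produced under a newer schema forces
// one schema sync, after which pending DDLs ("alter cmd 1", "alter cmd 2")
// become visible.
static long cached_schema_version = 1; // what this TiFlash node knows
static long write_schema_version = 2;  // version the write was encoded under

bool tryDecodeWrite() { return cached_schema_version >= write_schema_version; }
void syncSchemaFromTiDB() { cached_schema_version = write_schema_version; }

int main()
{
    if (!tryDecodeWrite())
    {
        syncSchemaFromTiDB(); // picks up the pending alters in one round
        if (!tryDecodeWrite())
            throw std::runtime_error("schema still mismatched after sync");
    }
    std::cout << "write applied at schema v" << cached_schema_version << '\n';
}
```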
+ +# TiDB Timeline : write cmd 1 | alter cmd 1 | write cmd 2 | alter cmd 2 | write cmd 3 + +# stop regular schema sync +=> DBGInvoke __enable_schema_sync_service('false') + +# Enable the failpoint and make it pause before applying the raft cmd to write a row +>> DBGInvoke __init_fail_point() +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# Enable the failpoint to make our query only start when the write action finished +>> DBGInvoke __enable_fail_point(unblock_query_init_after_write) + +# ----------------------------------------------------------------------------- +# Order 1 : write cmd 1 | alter cmd 1 | write cmd 2 | alter cmd 2 | write cmd 3 +# ----------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +│ 3 │ 0.20 │ ccc │ 3 │ 0.1 │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# make alter cmd 2 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +└─────┴───────┴─────┴─────┘ + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# --------------------------------------------------------------------------------------------- +# Order 2 : write cmd 1 | alter cmd 1 | write cmd 2 | write cmd 3 --> sync schema(alter cmd 2) +# --------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int 
primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +│ 3 │ 0.20 │ ccc │ 3 │ 0.1 │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +# check what happen after write cmd 3 --> call sync schema and get alter cmd 2 happen +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# ----------------------------------------------------------------------------------------------- +# Order 3 : write cmd 1 | alter cmd 1 | alter cmd 2 | write cmd 2 -->sync schema() | write cmd 3 +# ----------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + 
+# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# make alter cmd 2 take effect +>> DBGInvoke __refresh_schemas() + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# ----------------------------------------------------------------------------------------------- +# Order 4 : write cmd 1 | write cmd 2 --> sync schema(alter cmd 1) | alter cmd 2 | write cmd 3 +# ----------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# check no schema change before write cmd 2 take effect +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +# check what happen after write cmd 2 --> should call sync schema, get the alter cmd 1 happened. 
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 1   │ 4.50 │ abc   │ 0   │ \N  │
+│ 3   │ 0.20 │ ccc   │ 3   │ 0.1 │
+└─────┴──────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# alter cmd 2
+mysql> alter table test.t drop column b;
+
+# make alter cmd 2 take effect
+>> DBGInvoke __refresh_schemas()
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+└─────┴───────┴─────┴─────┘
+
+# write cmd 3
+mysql> insert into test.t values (4, 'abcd', 10, 0.2);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 3 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+│ 4   │ abcd  │ 10  │ 0.2 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# -----------------------------------------------------------------------------------------------------------
+# Order 5 : write cmd 1 | write cmd 2 --> sync schema(alter cmd 1) | write cmd 3 --> sync schema(alter cmd 2)
+# -----------------------------------------------------------------------------------------------------------
+
+mysql> drop table if exists test.t
+mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0);
+
+mysql> alter table test.t set tiflash replica 1;
+
+func> wait_table test t
+
+# write cmd 1
+mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 1 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┐
+│ 1   │ 4.50 │ abc   │ 0   │
+└─────┴──────┴───────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# alter cmd 1
+mysql> alter table test.t add column e decimal(6,1) NULL;
+
+# write cmd 2
+mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 2 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+# check what happens after write cmd 2 --> it should trigger a schema sync and apply alter cmd 1.
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 1   │ 4.50 │ abc   │ 0   │ \N  │
+│ 3   │ 0.20 │ ccc   │ 3   │ 0.1 │
+└─────┴──────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# alter cmd 2
+mysql> alter table test.t drop column b;
+
+# write cmd 3
+mysql> insert into test.t values (4, 'abcd', 10, 0.2);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 3 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+# check what happens after write cmd 3 --> it should trigger a schema sync and apply alter cmd 2.
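+# (write cmd 3 carries no value for the dropped column b, so decoding it should force a sync up to alter cmd 2)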
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+│ 4   │ abcd  │ 10  │ 0.2 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# -----------------------------------------------------------------------------------------------
+# Order 6 : write cmd 1 | write cmd 2 --> sync schema(alter cmd 1, alter cmd 2) | write cmd 3
+# -----------------------------------------------------------------------------------------------
+
+mysql> drop table if exists test.t
+mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0);
+
+mysql> alter table test.t set tiflash replica 1;
+
+func> wait_table test t
+
+# write cmd 1
+mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 1 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┐
+│ 1   │ 4.50 │ abc   │ 0   │
+└─────┴──────┴───────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# alter cmd 1
+mysql> alter table test.t add column e decimal(6,1) NULL;
+
+# write cmd 2
+mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1);
+
+# alter cmd 2
+mysql> alter table test.t drop column b;
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 2 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+# check what happens after write cmd 2 --> it should trigger a schema sync and apply both alter cmd 1 and alter cmd 2.
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 3
+mysql> insert into test.t values (4, 'abcd', 10, 0.2);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 3 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+│ 4   │ abcd  │ 10  │ 0.2 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# -------------------------------------------------------------------------------
+# Order 7 : alter cmd 1 | write cmd 1 | write cmd 2 | alter cmd 2 | write cmd 3
+# -------------------------------------------------------------------------------
+
+mysql> drop table if exists test.t
+mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0);
+
+mysql> alter table test.t set tiflash replica 1;
+
+func> wait_table test t
+
+# add a pre-write row to make checking alter cmd 1 more convenient.
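+# (with a pre-existing row, the default \N of the column added by alter cmd 1 shows up directly in the next query result)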
+mysql> insert into test.t (a, b, c) values (0, 0, ' ');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┐
+│ 0   │ 0.00 │       │ 0   │
+└─────┴──────┴───────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+
+# alter cmd 1
+mysql> alter table test.t add column e decimal(6,1) NULL;
+
+# make alter cmd 1 take effect
+>> DBGInvoke __refresh_schemas()
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 0   │ 0.00 │       │ 0   │ \N  │
+└─────┴──────┴───────┴─────┴─────┘
+
+# write cmd 1
+mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 1 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 0   │ 0.00 │       │ 0   │ \N  │
+│ 1   │ 4.50 │ abc   │ 0   │ \N  │
+└─────┴──────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 2
+mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 2 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 0   │ 0.00 │       │ 0   │ \N  │
+│ 1   │ 4.50 │ abc   │ 0   │ \N  │
+│ 3   │ 0.20 │ ccc   │ 3   │ 0.1 │
+└─────┴──────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# alter cmd 2
+mysql> alter table test.t drop column b;
+
+# make alter cmd 2 take effect
+>> DBGInvoke __refresh_schemas()
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 0   │       │ 0   │ \N  │
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+└─────┴───────┴─────┴─────┘
+
+# write cmd 3
+mysql> insert into test.t values (4, 'abcd', 10, 0.2);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 3 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 0   │       │ 0   │ \N  │
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+│ 4   │ abcd  │ 10  │ 0.2 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# --------------------------------------------------------------------------------------------------
+# Order 8 : alter cmd 1 | write cmd 1 | write cmd 2 | write cmd 3 --> sync schema(alter cmd 2)
+# --------------------------------------------------------------------------------------------------
+
+mysql> drop table if exists test.t
+mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0);
+
+mysql> alter table test.t set tiflash replica 1;
+
+func> wait_table test t
+
+# add a pre-write row to make checking alter cmd 1 more convenient.
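+# (in this order no __refresh_schemas() follows alter cmd 2; the sync is expected to be triggered
+#  by decoding write cmd 3)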
+mysql> insert into test.t (a, b, c) values (0, 0, ' ');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┐
+│ 0   │ 0.00 │       │ 0   │
+└─────┴──────┴───────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# alter cmd 1
+mysql> alter table test.t add column e decimal(6,1) NULL;
+
+# make alter cmd 1 take effect
+>> DBGInvoke __refresh_schemas()
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 0   │ 0.00 │       │ 0   │ \N  │
+└─────┴──────┴───────┴─────┴─────┘
+
+# write cmd 1
+mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 1 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 0   │ 0.00 │       │ 0   │ \N  │
+│ 1   │ 4.50 │ abc   │ 0   │ \N  │
+└─────┴──────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 2
+mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 2 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 0   │ 0.00 │       │ 0   │ \N  │
+│ 1   │ 4.50 │ abc   │ 0   │ \N  │
+│ 3   │ 0.20 │ ccc   │ 3   │ 0.1 │
+└─────┴──────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# alter cmd 2
+mysql> alter table test.t drop column b;
+
+# write cmd 3
+mysql> insert into test.t values (4, 'abcd', 10, 0.2);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 3 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+# check what happens after write cmd 3 --> it should trigger a schema sync and apply alter cmd 2.
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 0   │       │ 0   │ \N  │
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+│ 4   │ abcd  │ 10  │ 0.2 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# --------------------------------------------------------------------------------------------------
+# Order 9 : alter cmd 1 | write cmd 1 | alter cmd 2 | write cmd 2 --> sync schema() | write cmd 3
+# --------------------------------------------------------------------------------------------------
+
+mysql> drop table if exists test.t
+mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0);
+
+mysql> alter table test.t set tiflash replica 1;
+
+func> wait_table test t
+
+# add a pre-write row to make checking alter cmd 1 more convenient.
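+# (alter cmd 1 takes effect before any numbered write cmd here, so every later row is decoded with column e present)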
+mysql> insert into test.t (a, b, c) values (0, 0, ' ');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┐
+│ 0   │ 0.00 │       │ 0   │
+└─────┴──────┴───────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# alter cmd 1
+mysql> alter table test.t add column e decimal(6,1) NULL;
+
+# make alter cmd 1 take effect
+>> DBGInvoke __refresh_schemas()
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 0   │ 0.00 │       │ 0   │ \N  │
+└─────┴──────┴───────┴─────┴─────┘
+
+# write cmd 1
+mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 1 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┬─e───┐
+│ 0   │ 0.00 │       │ 0   │ \N  │
+│ 1   │ 4.50 │ abc   │ 0   │ \N  │
+└─────┴──────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 2
+mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1);
+
+# alter cmd 2
+mysql> alter table test.t drop column b;
+
+# make alter cmd 2 take effect
+>> DBGInvoke __refresh_schemas()
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 2 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 0   │       │ 0   │ \N  │
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 3
+mysql> insert into test.t values (4, 'abcd', 10, 0.2);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 3 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 0   │       │ 0   │ \N  │
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+│ 4   │ abcd  │ 10  │ 0.2 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# ------------------------------------------------------------------------------------------------------------------
+# Order 10 : alter cmd 1 | alter cmd 2 | write cmd 1 --> sync schema() | write cmd 2 --> sync schema() | write cmd 3
+# ------------------------------------------------------------------------------------------------------------------
+
+mysql> drop table if exists test.t
+mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0);
+
+mysql> alter table test.t set tiflash replica 1;
+
+func> wait_table test t
+
+# add a pre-write row to make checking alter cmd 1 more convenient.
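+# (both alter cmds are refreshed before the writes are unblocked, so a single
+#  __disable_fail_point releases write cmd 1 and write cmd 2 together under the final schema)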
+mysql> insert into test.t (a, b, c) values (0, 0, ' ');
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┐
+│ 0   │ 0.00 │       │ 0   │
+└─────┴──────┴───────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 1
+mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc');
+
+# alter cmd 1
+mysql> alter table test.t add column e decimal(6,1) NULL;
+
+# make alter cmd 1 take effect
+>> DBGInvoke __refresh_schemas()
+
+# write cmd 2
+mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1);
+
+# alter cmd 2
+mysql> alter table test.t drop column b;
+
+# make alter cmd 2 take effect
+>> DBGInvoke __refresh_schemas()
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 1 and write cmd 2 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 0   │       │ 0   │ \N  │
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 3
+mysql> insert into test.t values (4, 'abcd', 10, 0.2);
+
+# enable pause_query_init so that queries do not start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 3 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│ 0   │       │ 0   │ \N  │
+│ 1   │ abc   │ 0   │ \N  │
+│ 3   │ ccc   │ 3   │ 0.1 │
+│ 4   │ abcd  │ 10  │ 0.2 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+
+## cleanup: restore the schema sync service and disable the fail points used above
+
+=> DBGInvoke __enable_schema_sync_service('true')
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+>> DBGInvoke __disable_fail_point(unblock_query_init_after_write)
+>> DBGInvoke __disable_fail_point(pause_query_init)
\ No newline at end of file
diff --git a/tests/fullstack-test2/ddl/rename_table_across_databases.test b/tests/fullstack-test2/ddl/rename_table_across_databases.test
index c78c27138a0..bc27668bd0c 100644
--- a/tests/fullstack-test2/ddl/rename_table_across_databases.test
+++ b/tests/fullstack-test2/ddl/rename_table_across_databases.test
@@ -52,7 +52,7 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select * from test_new
 +------+------+
 
 # check if table info updated.
->> select tidb_database,tidb_name from system.tables where is_tombstone = 0 and (tidb_database = 'test' and tidb_name='t') or (tidb_database='test_new' and tidb_name='t2')
+>> select tidb_database,tidb_name from system.tables where is_tombstone = 0 and ((tidb_database = 'test' and tidb_name='t') or (tidb_database='test_new' and tidb_name='t2'))
 ┌─tidb_database─┬─tidb_name─┐
 │ test_new      │ t2        │
 └───────────────┴───────────┘
diff --git a/tests/run-test.py b/tests/run-test.py
index 843fe7c79b4..a2bcee0ce99 100644
--- a/tests/run-test.py
+++ b/tests/run-test.py
@@ -29,6 +29,7 @@
 UNFINISHED_1_PREFIX = '\t'
 UNFINISHED_2_PREFIX = ' '
 WORD_PH = '{#WORD}'
+LINE_PH = '{#LINE}'
 CURL_TIDB_STATUS_PREFIX = 'curl_tidb> '
 
 verbose = False
@@ -138,18 +139,22 @@ def match_ph_word(line):
 
 # TODO: Support more place holders, eg: {#NUMBER}
 def compare_line(line, template):
-    while True:
-        i = template.find(WORD_PH)
-        if i < 0:
-            return line == template
-        else:
-            if line[:i] != template[:i]:
-                return False
-            j = match_ph_word(line[i:])
-            if j == 0:
-                return False
-            template = template[i + len(WORD_PH):]
-            line = line[i + j:]
+    l = template.find(LINE_PH)
+    if l >= 0:
+        return True
+    else:
+        while True:
+            i = template.find(WORD_PH)
+            if i < 0:
+                return line == template
+            else:
+                if line[:i] != template[:i]:
+                    return False
+                j = match_ph_word(line[i:])
+                if j == 0:
+                    return False
+                template = template[i + len(WORD_PH):]
+                line = line[i + j:]
 
 
 class MySQLCompare:
@@ -194,11 +199,14 @@ def matched(outputs, matches):
             b = MySQLCompare.parse_excepted_outputs(matches)
             return a == b
         else:
-            if len(outputs) != len(matches):
+            if len(outputs) > len(matches):
                 return False
             for i in range(0, len(outputs)):
                 if not compare_line(outputs[i], matches[i]):
                     return False
+            for i in range(len(outputs), len(matches)):
+                if not compare_line("", matches[i]):
+                    return False
             return True
 
 
@@ -212,11 +220,14 @@ def matched(outputs, matches, fuzz):
         b = parse_table_parts(matches, fuzz)
         return a == b
     else:
-        if len(outputs) != len(matches):
+        if len(outputs) > len(matches):
             return False
         for i in range(0, len(outputs)):
             if not compare_line(outputs[i], matches[i]):
                 return False
+        for i in range(len(outputs), len(matches)):
+            if not compare_line("", matches[i]):
+                return False
         return True
 
 
diff --git a/tests/sanitize/tsan.suppression b/tests/sanitize/tsan.suppression
new file mode 100644
index 00000000000..73824caa2b9
--- /dev/null
+++ b/tests/sanitize/tsan.suppression
@@ -0,0 +1 @@
+race:dbms/src/Common/TiFlashMetrics.h
diff --git a/tests/tidb-ci/new_collation_fullstack/expr.test b/tests/tidb-ci/new_collation_fullstack/expr.test
index 15ada0f335c..1e2135c4f2d 100644
--- a/tests/tidb-ci/new_collation_fullstack/expr.test
+++ b/tests/tidb-ci/new_collation_fullstack/expr.test
@@ -35,6 +35,13 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_s
 |    2 | abc   |
 +------+-------+
 
+mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_storage(tiflash[t]) */ id, value1 from test.t where value1 = 'abc ';
++------+-------+
+| id   | value1|
++------+-------+
+|    1 | abc   |
+|    2 | abc   |
++------+-------+
 
 mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_storage(tiflash[t]) */ id, value from test.t where value like 'aB%';
 +------+-------+
@@ -62,6 +69,13 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_s
 |    3 | def   |
 +------+-------+
 
+mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_storage(tiflash[t]) */ id, value1 from test.t where value1 = 'def ';
++------+-------+
+| id   | value1|
++------+-------+
+|    3 | def   |
++------+-------+
+
 mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_storage(tiflash[t]) */ id, value1 from test.t where value1 in ('Abc','def');
 +------+-------+
 | id   | value1|