diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 4dd4981dfb48a7..32bc9364832d5f 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,17 +1,3 @@ -Related issue = # +Issue: # - +### Brief Summary diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ba6b5128dfa31a..0f2cbc90a76407 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -87,7 +87,7 @@ jobs: mkdir -m777 shared docker create --user dev --name taichi_build --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \ -e DISPLAY -e PY -e GPU_BUILD -e TAICHI_CMAKE_ARGS -e PROJECT_NAME \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.2 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix_build.sh tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - taichi_build:/home/dev/ docker start -a taichi_build @@ -111,7 +111,7 @@ jobs: run: | docker create --user dev --name taichi_test --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \ -e DISPLAY -e PY -e GPU_TEST \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.2 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/unix_test.sh docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh docker cp .github/workflows/scripts/common-utils.sh taichi_test:/home/dev/common-utils.sh diff --git a/.github/workflows/scripts/win_build.ps1 b/.github/workflows/scripts/win_build.ps1 index 38ecd156a97530..dba6d15e0cc9cf 100644 --- a/.github/workflows/scripts/win_build.ps1 +++ b/.github/workflows/scripts/win_build.ps1 @@ -62,7 +62,7 @@ if (!$llvmVer.CompareTo("10")) { } else { if (-not (Test-Path "taichi_llvm_15")) { WriteInfo("Download and extract LLVM") - curl.exe --retry 10 --retry-delay 5 
https://github.com/python3kgae/taichi_assets/releases/download/llvm15_vs2019_clang_220731/taichi-llvm-15.0.0-msvc2019.zip -LO + curl.exe --retry 10 --retry-delay 5 https://github.com/python3kgae/taichi_assets/releases/download/llvm15_vs2019_clang/taichi-llvm-15.0.0-msvc2019.zip -LO if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x taichi-llvm-15.0.0-msvc2019.zip -otaichi_llvm_15 } diff --git a/.github/workflows/scripts/win_build_test_cpu.ps1 b/.github/workflows/scripts/win_build_test_cpu.ps1 index 0b1a4cd5e7faaf..4aa90582517edc 100644 --- a/.github/workflows/scripts/win_build_test_cpu.ps1 +++ b/.github/workflows/scripts/win_build_test_cpu.ps1 @@ -66,6 +66,7 @@ if (!$llvmVer.CompareTo("10")) { } else { $env:TAICHI_CMAKE_ARGS += " -DLLVM_AS_EXECUTABLE=C:\\taichi_llvm_15\\bin\\llvm-as.exe -DTI_WITH_VULKAN:BOOL=OFF" $env:TAICHI_CMAKE_ARGS += " -DTI_LLVM_15:BOOL=ON" + $env:TAICHI_CMAKE_ARGS += " -DTI_WITH_DX12:BOOL=ON" } diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 95262572addf94..6ac9cd1927c7a7 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -219,7 +219,7 @@ jobs: python: 3.7 with_cc: OFF with_cpp_tests: ON - wanted_archs: 'cpu' + wanted_archs: 'cpu,vulkan' runs-on: - self-hosted - ${{ matrix.os }} @@ -405,7 +405,7 @@ jobs: --gpus 'all,"capabilities=graphics,utility,display,video,compute"' \ -v /tmp/.X11-unix:/tmp/.X11-unix \ -e PY -e GPU_BUILD -e PROJECT_NAME -e TAICHI_CMAKE_ARGS -e DISPLAY \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix_build.sh # A tarball is needed because sccache needs some permissions that only the file owner has. # 1000 is the uid and gid of user "dev" in the container. 
@@ -444,7 +444,7 @@ jobs: -e TI_LITE_TEST \ -e TI_TEST_OFFLINE_CACHE \ -e DISPLAY -e PY -e GPU_TEST -e TI_WANTED_ARCHS -e TI_RUN_RELEASE_TESTS \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/unix_test.sh docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh docker cp .github/workflows/scripts/common-utils.sh taichi_test:/home/dev/common-utils.sh @@ -671,7 +671,7 @@ jobs: docker run --user dev --name taichi_build_host \ $DOCKER_RUN_ARGS \ -v $TAICHI_WHEEL_DIR:/home/dev/taichi/dist \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix-build-v2.sh env: TAICHI_CMAKE_ARGS: >- diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b111d69110219..0cc5703b7d5dd1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,20 +6,6 @@ cmake_minimum_required(VERSION 3.17) project(taichi) -# Taichi does not set target architecture explicitly, -# but rather rely on CMake to detect the host arch. -# -# However on Mac m1, there are two available architectures namely x86_64 and arm64. -# On some combination of "OSX version" and "CMake version", CMake will use x86_64 as default architecture even if it's on m1 chip. -# This causes conflicts with the precompiled LLVM/Clang binaries downloaded from Taichi's repo (pre-built for arm64) -# -# Therefore we force CMake to choose arm64 architecture on arm64 chips. -if (APPLE) - if( "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "arm64" ) - set(CMAKE_OSX_ARCHITECTURES ${CMAKE_HOST_SYSTEM_PROCESSOR}) - endif() -endif() - if (NOT DEFINED TI_VERSION_MAJOR) message(WARNING "It seems that you are running cmake manually, which may cause issues. 
Please use setup.py to build taichi from source, see https://docs.taichi-lang.org/docs/dev_install for more details.") set(TI_VERSION_MAJOR 0) @@ -117,15 +103,12 @@ if (TI_BUILD_TESTS) endif() option(TI_BUILD_EXAMPLES "Build the CPP examples" ON) +option(TI_BUILD_RHI_EXAMPLES "Build the Unified Device API examples" OFF) if(NOT TI_WITH_LLVM OR NOT TI_WITH_METAL) set(TI_BUILD_EXAMPLES OFF) endif() -if (TI_BUILD_EXAMPLES) - include(cmake/TaichiExamples.cmake) -endif() - message("C++ Flags: ${CMAKE_CXX_FLAGS}") message("Build type: ${CMAKE_BUILD_TYPE}") @@ -216,6 +199,14 @@ if (TI_WITH_C_API) endif() endif() +if (TI_BUILD_EXAMPLES) + include(cmake/TaichiExamples.cmake) +endif() + +if (TI_BUILD_RHI_EXAMPLES) + add_subdirectory(cpp_examples/rhi_examples) +endif() + option(TI_WITH_GRAPHVIZ "generate dependency graphs between targets" OFF) if (TI_WITH_GRAPHVIZ) diff --git a/c_api/include/taichi/cpp/taichi.hpp b/c_api/include/taichi/cpp/taichi.hpp index 8e594753d177fd..82af92bcabca90 100644 --- a/c_api/include/taichi/cpp/taichi.hpp +++ b/c_api/include/taichi/cpp/taichi.hpp @@ -1,4 +1,5 @@ // C++ wrapper of Taichi C-API +#pragma once #include #include #include diff --git a/ci/Dockerfile.ubuntu.18.04 b/ci/Dockerfile.ubuntu.18.04 index 07aef59d22d6d3..f991f0390461ad 100644 --- a/ci/Dockerfile.ubuntu.18.04 +++ b/ci/Dockerfile.ubuntu.18.04 @@ -93,17 +93,17 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh & bash Miniconda3-latest-Linux-x86_64.sh -p /home/dev/miniconda -b ENV PATH="/home/dev/miniconda/bin:$PATH" -# Set up multi-python environment -RUN conda init bash -RUN conda create -n py36 python=3.6 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py37 python=3.7 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py38 python=3.8 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py39 python=3.9 pytorch cudatoolkit=10.2 -c pytorch -y -# TODO add torch to 3.10 when supported -RUN conda create -n py310 python=3.10 -y - 
# Remove mesa EGL driver, which interferes with the propritary NVIDIA drivers RUN rm -f /usr/lib/x86_64-linux-gnu/libEGL_mesa* WORKDIR /home/dev ENV LANG="C.UTF-8" + +# Set up multi-python environment +RUN conda init bash +RUN conda create -n py36 python=3.6 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py37 python=3.7 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py38 python=3.8 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py39 python=3.9 pytorch cudatoolkit=11.3 -c pytorch -y +# TODO add torch to 3.10 when supported +RUN conda create -n py310 python=3.10 -y diff --git a/ci/windows/win_build_test.ps1 b/ci/windows/win_build_test.ps1 index 2e75e991e371a8..ed32d138bbe6d6 100644 --- a/ci/windows/win_build_test.ps1 +++ b/ci/windows/win_build_test.ps1 @@ -32,7 +32,7 @@ if (!$llvmVer.CompareTo("10")) { } else { if (-not (Test-Path "taichi_llvm_15")) { WriteInfo("Download and extract LLVM") - curl.exe --retry 10 --retry-delay 5 https://github.com/python3kgae/taichi_assets/releases/download/llvm15_vs2019_clang_220731/taichi-llvm-15.0.0-msvc2019.zip -LO + curl.exe --retry 10 --retry-delay 5 https://github.com/python3kgae/taichi_assets/releases/download/llvm15_vs2019_clang/taichi-llvm-15.0.0-msvc2019.zip -LO if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x taichi-llvm-15.0.0-msvc2019.zip -otaichi_llvm_15 } diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index 0b73ac4324c2c0..687b705f37084e 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -81,8 +81,15 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64" OR "${CMAKE_SYSTEM_PROCESSOR}" if (MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D \"TI_ARCH_x64\"") else() - message("Setting -march=nehalem for x86_64 processors") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=nehalem -DTI_ARCH_x64") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_ARCH_x64") + if ("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) + # TODO: (penguinliong) 
Will probably need this in a future version + # of Clang. Clang11 doesn't recognize this. + #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1") + else() + message("Setting -march=nehalem for x86_64 processors") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=nehalem") + endif() endif() set(ARCH "x64") elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 4c57b56796c924..7478f031890382 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -319,6 +319,11 @@ if (TI_WITH_OPENGL OR TI_WITH_VULKAN OR TI_WITH_DX11) target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE gfx_runtime) endif() +if (TI_WITH_OPENGL OR TI_WITH_DX11) + set(SPIRV_CROSS_CLI false) + add_subdirectory(${PROJECT_SOURCE_DIR}/external/SPIRV-Cross ${PROJECT_BINARY_DIR}/external/SPIRV-Cross) +endif() + # Vulkan Device API if (TI_WITH_VULKAN) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_VULKAN") diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index 8655ce7ecb5e97..569de8d71cfa9a 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -20,7 +20,8 @@ file(GLOB_RECURSE TAICHI_TESTS_SOURCE "tests/cpp/llvm/*.cpp" "tests/cpp/program/*.cpp" "tests/cpp/struct/*.cpp" - "tests/cpp/transforms/*.cpp") + "tests/cpp/transforms/*.cpp" + "tests/cpp/offline_cache/*.cpp") if (TI_WITH_OPENGL OR TI_WITH_VULKAN) file(GLOB TAICHI_TESTS_GFX_UTILS_SOURCE diff --git a/cpp_examples/rhi_examples/CMakeLists.txt b/cpp_examples/rhi_examples/CMakeLists.txt new file mode 100644 index 00000000000000..5cedaca844d22f --- /dev/null +++ b/cpp_examples/rhi_examples/CMakeLists.txt @@ -0,0 +1,31 @@ +macro(make_sample executable_name src_file) +add_executable(${executable_name}) +set_property(TARGET ${executable_name} PROPERTY CXX_STANDARD 17) +set_property(TARGET ${executable_name} PROPERTY C_STANDARD 17) +target_sources(${executable_name} PRIVATE ${src_file} "common.h") 
+target_include_directories(${executable_name} +PRIVATE + ${PROJECT_SOURCE_DIR} + + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/FP16/include + ${PROJECT_SOURCE_DIR}/external/SPIRV-Reflect + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${LLVM_INCLUDE_DIRS} + + ${PROJECT_SOURCE_DIR}/external/volk + ${PROJECT_SOURCE_DIR}/external/Vulkan-Headers/include + ${PROJECT_SOURCE_DIR}/external/glfw/include + ${PROJECT_SOURCE_DIR}/external/glm + ${PROJECT_SOURCE_DIR}/external/imgui +) +target_include_directories(${executable_name} SYSTEM + PUBLIC + ${PROJECT_SOURCE_DIR}/external/VulkanMemoryAllocator/include + ) +target_link_libraries(${executable_name} taichi_c_api glfw) +endmacro() + +make_sample(sample_1_window sample_1_window.cpp) +make_sample(sample_2_triangle sample_2_triangle.cpp) diff --git a/cpp_examples/rhi_examples/common.h b/cpp_examples/rhi_examples/common.h new file mode 100644 index 00000000000000..b379258131d8ce --- /dev/null +++ b/cpp_examples/rhi_examples/common.h @@ -0,0 +1,129 @@ +#pragma once + +#include "taichi/rhi/vulkan/vulkan_device.h" +#include "taichi/rhi/vulkan/vulkan_common.h" +#include "taichi/rhi/vulkan/vulkan_loader.h" +#include "taichi/rhi/vulkan/vulkan_device_creator.h" + +#define GLFW_INCLUDE_NONE +#include "GLFW/glfw3.h" +#include "glm/glm.hpp" + +using namespace taichi::lang; + +static void glfw_error_callback(int code, const char *description) { + TI_WARN("GLFW Error {}: {}", code, description); +} + +std::vector get_required_instance_extensions() { + std::vector extensions; + + uint32_t glfw_ext_count = 0; + const char **glfw_extensions; + glfw_extensions = glfwGetRequiredInstanceExtensions(&glfw_ext_count); + + for (int i = 0; i < glfw_ext_count; ++i) { + extensions.push_back(glfw_extensions[i]); + } + // VulkanDeviceCreator will check that these are supported + extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + + return 
extensions; +} + +std::vector get_required_device_extensions() { + static std::vector extensions{ + VK_KHR_SWAPCHAIN_EXTENSION_NAME, + }; + + return extensions; +} + +class App { + public: + App(int width, int height, const std::string &title) { + TI_INFO("Creating App '{}' of {}x{}", title, width, height); + + TI_ASSERT(taichi::lang::vulkan::is_vulkan_api_available()); + + if (glfwInit()) { + TI_INFO("Initialized GLFW"); + + glfwSetErrorCallback(glfw_error_callback); + + glfwWindowHint(GLFW_VISIBLE, GLFW_TRUE); + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfw_window = + glfwCreateWindow(width, height, "Sample Window", nullptr, nullptr); + + TI_INFO("Initialized GLFWWindow"); + } else { + TI_ERROR("failed to init GLFW"); + } + + { + vulkan::VulkanDeviceCreator::Params evd_params; + evd_params.api_version = std::nullopt; + evd_params.additional_instance_extensions = + get_required_instance_extensions(); + evd_params.additional_device_extensions = + get_required_device_extensions(); + evd_params.is_for_ui = true; + evd_params.surface_creator = [&](VkInstance instance) -> VkSurfaceKHR { + VkSurfaceKHR surface = VK_NULL_HANDLE; + + if (glfwCreateWindowSurface(instance, glfw_window, nullptr, &surface) != + VK_SUCCESS) { + TI_ERROR("failed to create window surface!"); + } + return surface; + }; + evd_params.enable_validation_layer = true; + + device_creator = + std::make_unique(evd_params); + device = device_creator->device(); + + TI_INFO("Initialized VulkanDevice"); + } + + { + SurfaceConfig config; + config.window_handle = glfw_window; + config.native_surface_handle = device_creator->get_surface(); + + surface = device->create_surface(config); + } + } + + virtual ~App() { + surface.reset(); + device_creator.reset(); + glfwDestroyWindow(glfw_window); + glfwTerminate(); + } + + virtual std::vector render_loop( + StreamSemaphore image_available_semaphore) { + return {}; + } + + void run() { + while (!glfwWindowShouldClose(glfw_window)) { + auto 
image_available_semaphore = surface->acquire_next_image(); + + glfwPollEvents(); + + surface->present_image(render_loop(image_available_semaphore)); + } + } + + public: + // Owned + GLFWwindow *glfw_window; + std::unique_ptr device_creator; + std::unique_ptr surface; + + // Weak references + vulkan::VulkanDevice *device; +}; diff --git a/cpp_examples/rhi_examples/sample_1_window.cpp b/cpp_examples/rhi_examples/sample_1_window.cpp new file mode 100644 index 00000000000000..c1e5d0272a5dc8 --- /dev/null +++ b/cpp_examples/rhi_examples/sample_1_window.cpp @@ -0,0 +1,21 @@ +#include "common.h" + +class SampleApp : public App { + public: + SampleApp() : App(1920, 1080, "Sample 1: Window") { + } + + std::vector render_loop( + StreamSemaphore image_available_semaphore) override { + return {}; + } + + public: +}; + +int main() { + std::unique_ptr app = std::make_unique(); + app->run(); + + return 0; +} diff --git a/cpp_examples/rhi_examples/sample_2_triangle.cpp b/cpp_examples/rhi_examples/sample_2_triangle.cpp new file mode 100644 index 00000000000000..8b7b84d4de827d --- /dev/null +++ b/cpp_examples/rhi_examples/sample_2_triangle.cpp @@ -0,0 +1,115 @@ +#include "common.h" + +std::vector frag_spv = +#include "shaders/2_triangle.frag.spv.h" + ; + +std::vector vert_spv = +#include "shaders/2_triangle.vert.spv.h" + ; + +struct Vertex { + glm::vec2 pos; + glm::vec3 color; +}; + +class SampleApp : public App { + public: + SampleApp() : App(1920, 1080, "Sample 2: Triangle") { + // Create the triangle raster pipeline + { + // Load the SPIRV source + std::vector src_desc(2); + + src_desc[0].data = (void *)frag_spv.data(); + src_desc[0].size = frag_spv.size() * sizeof(uint32_t); + src_desc[0].type = PipelineSourceType::spirv_binary; + src_desc[0].stage = PipelineStageType::fragment; + + src_desc[1].data = (void *)vert_spv.data(); + src_desc[1].size = vert_spv.size() * sizeof(uint32_t); + src_desc[1].type = PipelineSourceType::spirv_binary; + src_desc[1].stage = 
PipelineStageType::vertex; + + // Setup rasterizer parameters + RasterParams raster_params; // use default + + // Setup vertex input parameters + std::vector vertex_inputs = { + {.binding = 0, .stride = sizeof(Vertex), .instance = false}}; + std::vector vertex_attrs = { + {.location = 0, + .binding = 0, + .format = BufferFormat::rg32f, + .offset = offsetof(Vertex, pos)}, + {.location = 1, + .binding = 0, + .format = BufferFormat::rgb32f, + .offset = offsetof(Vertex, color)}}; + + // Create pipeline + pipeline = device->create_raster_pipeline(src_desc, raster_params, + vertex_inputs, vertex_attrs); + } + + // Create the vertex buffer + { + vertex_buffer = device->allocate_memory_unique( + Device::AllocParams{.size = 3 * sizeof(Vertex), + .host_write = true, + .usage = AllocUsage::Vertex}); + Vertex *mapped = (Vertex *)device->map(*vertex_buffer); + mapped[0] = {{0.0, 0.5}, {1.0, 0.0, 0.0}}; + mapped[1] = {{0.5, -0.5}, {0.0, 1.0, 0.0}}; + mapped[2] = {{-0.5, -0.5}, {0.0, 0.0, 1.0}}; + device->unmap(*vertex_buffer); + } + + TI_INFO("App Init Done"); + } + + std::vector render_loop( + StreamSemaphore image_available_semaphore) override { + auto cmdlist = device->get_graphics_stream()->new_command_list(); + + // Set-up our frame buffer attachment + DeviceAllocation surface_image = surface->get_target_image(); + cmdlist->image_transition(surface_image, ImageLayout::undefined, + ImageLayout::color_attachment); + + // Renderpass: render to surface image, clear color values + bool clear = true; + std::vector clear_color = {0.1, 0.2, 0.3, 1.0}; + const auto &[width, height] = surface->get_size(); + cmdlist->begin_renderpass(0, 0, width, height, 1, &surface_image, &clear, + &clear_color, nullptr, false); + + // Bind our triangle pipeline + cmdlist->bind_pipeline(pipeline.get()); + // Get the binder and bind our vertex buffer + auto resource_binder = pipeline->resource_binder(); + resource_binder->vertex_buffer(vertex_buffer->get_ptr(0), 0); + 
cmdlist->bind_resources(resource_binder); + // Render the triangle + cmdlist->draw(3, 0); + // End rendering + cmdlist->end_renderpass(); + + // Submit command list, returns render complete semaphore + auto render_complete_semaphore = device->get_graphics_stream()->submit( + cmdlist.get(), {image_available_semaphore}); + return {render_complete_semaphore}; + } + + public: + std::unique_ptr pipeline; + + std::unique_ptr vertex_buffer; +}; + +int main() { + std::unique_ptr app = std::make_unique(); + app->run(); + + return 0; +} diff --git a/cpp_examples/rhi_examples/shaders/2_triangle.frag b/cpp_examples/rhi_examples/shaders/2_triangle.frag new file mode 100644 index 00000000000000..cb913b404f108e --- /dev/null +++ b/cpp_examples/rhi_examples/shaders/2_triangle.frag @@ -0,0 +1,9 @@ +#version 460 + +layout(location = 0) in vec3 color; + +layout(location = 0) out vec4 frag_output; + +void main() { + frag_output = vec4(color, 1.0); +} diff --git a/cpp_examples/rhi_examples/shaders/2_triangle.frag.spv.h b/cpp_examples/rhi_examples/shaders/2_triangle.frag.spv.h new file mode 100644 index 00000000000000..aeb0437777d39e --- /dev/null +++ b/cpp_examples/rhi_examples/shaders/2_triangle.frag.spv.h @@ -0,0 +1,26 @@ +{ + 0x07230203, 0x00010000, 0x000d000a, 0x00000013, 0x00000000, 0x00020011, + 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, + 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0007000f, 0x00000004, + 0x00000004, 0x6e69616d, 0x00000000, 0x00000009, 0x0000000c, 0x00030010, + 0x00000004, 0x00000007, 0x00030003, 0x00000002, 0x000001cc, 0x000a0004, + 0x475f4c47, 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, + 0x7269645f, 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, + 0x6e695f45, 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, + 0x00000004, 0x6e69616d, 0x00000000, 0x00050005, 0x00000009, 0x67617266, + 0x74756f5f, 0x00747570, 0x00040005, 0x0000000c, 0x6f6c6f63, 0x00000072, + 0x00040047, 0x00000009, 
0x0000001e, 0x00000000, 0x00040047, 0x0000000c, + 0x0000001e, 0x00000000, 0x00020013, 0x00000002, 0x00030021, 0x00000003, + 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, + 0x00000006, 0x00000004, 0x00040020, 0x00000008, 0x00000003, 0x00000007, + 0x0004003b, 0x00000008, 0x00000009, 0x00000003, 0x00040017, 0x0000000a, + 0x00000006, 0x00000003, 0x00040020, 0x0000000b, 0x00000001, 0x0000000a, + 0x0004003b, 0x0000000b, 0x0000000c, 0x00000001, 0x0004002b, 0x00000006, + 0x0000000e, 0x3f800000, 0x00050036, 0x00000002, 0x00000004, 0x00000000, + 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x0000000a, 0x0000000d, + 0x0000000c, 0x00050051, 0x00000006, 0x0000000f, 0x0000000d, 0x00000000, + 0x00050051, 0x00000006, 0x00000010, 0x0000000d, 0x00000001, 0x00050051, + 0x00000006, 0x00000011, 0x0000000d, 0x00000002, 0x00070050, 0x00000007, + 0x00000012, 0x0000000f, 0x00000010, 0x00000011, 0x0000000e, 0x0003003e, + 0x00000009, 0x00000012, 0x000100fd, 0x00010038 +} diff --git a/cpp_examples/rhi_examples/shaders/2_triangle.vert b/cpp_examples/rhi_examples/shaders/2_triangle.vert new file mode 100644 index 00000000000000..120141945591eb --- /dev/null +++ b/cpp_examples/rhi_examples/shaders/2_triangle.vert @@ -0,0 +1,11 @@ +#version 460 + +layout(location = 0) in vec2 v_position; +layout(location = 1) in vec3 v_color; + +layout(location = 0) out vec3 color; + +void main() { + gl_Position = vec4(v_position, 0.0, 1.0); + color = v_color; +} diff --git a/cpp_examples/rhi_examples/shaders/2_triangle.vert.spv.h b/cpp_examples/rhi_examples/shaders/2_triangle.vert.spv.h new file mode 100644 index 00000000000000..ec77d9d4d12933 --- /dev/null +++ b/cpp_examples/rhi_examples/shaders/2_triangle.vert.spv.h @@ -0,0 +1,47 @@ +{ + 0x07230203, 0x00010000, 0x000d000a, 0x00000021, 0x00000000, 0x00020011, + 0x00000001, 0x0006000b, 0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, + 0x00000000, 0x0003000e, 0x00000000, 0x00000001, 0x0009000f, 0x00000000, + 0x00000004, 0x6e69616d, 
0x00000000, 0x0000000d, 0x00000012, 0x0000001d, + 0x0000001f, 0x00030003, 0x00000002, 0x000001cc, 0x000a0004, 0x475f4c47, + 0x4c474f4f, 0x70635f45, 0x74735f70, 0x5f656c79, 0x656e696c, 0x7269645f, + 0x69746365, 0x00006576, 0x00080004, 0x475f4c47, 0x4c474f4f, 0x6e695f45, + 0x64756c63, 0x69645f65, 0x74636572, 0x00657669, 0x00040005, 0x00000004, + 0x6e69616d, 0x00000000, 0x00060005, 0x0000000b, 0x505f6c67, 0x65567265, + 0x78657472, 0x00000000, 0x00060006, 0x0000000b, 0x00000000, 0x505f6c67, + 0x7469736f, 0x006e6f69, 0x00070006, 0x0000000b, 0x00000001, 0x505f6c67, + 0x746e696f, 0x657a6953, 0x00000000, 0x00070006, 0x0000000b, 0x00000002, + 0x435f6c67, 0x4470696c, 0x61747369, 0x0065636e, 0x00070006, 0x0000000b, + 0x00000003, 0x435f6c67, 0x446c6c75, 0x61747369, 0x0065636e, 0x00030005, + 0x0000000d, 0x00000000, 0x00050005, 0x00000012, 0x6f705f76, 0x69746973, + 0x00006e6f, 0x00040005, 0x0000001d, 0x6f6c6f63, 0x00000072, 0x00040005, + 0x0000001f, 0x6f635f76, 0x00726f6c, 0x00050048, 0x0000000b, 0x00000000, + 0x0000000b, 0x00000000, 0x00050048, 0x0000000b, 0x00000001, 0x0000000b, + 0x00000001, 0x00050048, 0x0000000b, 0x00000002, 0x0000000b, 0x00000003, + 0x00050048, 0x0000000b, 0x00000003, 0x0000000b, 0x00000004, 0x00030047, + 0x0000000b, 0x00000002, 0x00040047, 0x00000012, 0x0000001e, 0x00000000, + 0x00040047, 0x0000001d, 0x0000001e, 0x00000000, 0x00040047, 0x0000001f, + 0x0000001e, 0x00000001, 0x00020013, 0x00000002, 0x00030021, 0x00000003, + 0x00000002, 0x00030016, 0x00000006, 0x00000020, 0x00040017, 0x00000007, + 0x00000006, 0x00000004, 0x00040015, 0x00000008, 0x00000020, 0x00000000, + 0x0004002b, 0x00000008, 0x00000009, 0x00000001, 0x0004001c, 0x0000000a, + 0x00000006, 0x00000009, 0x0006001e, 0x0000000b, 0x00000007, 0x00000006, + 0x0000000a, 0x0000000a, 0x00040020, 0x0000000c, 0x00000003, 0x0000000b, + 0x0004003b, 0x0000000c, 0x0000000d, 0x00000003, 0x00040015, 0x0000000e, + 0x00000020, 0x00000001, 0x0004002b, 0x0000000e, 0x0000000f, 0x00000000, + 0x00040017, 0x00000010, 
0x00000006, 0x00000002, 0x00040020, 0x00000011, + 0x00000001, 0x00000010, 0x0004003b, 0x00000011, 0x00000012, 0x00000001, + 0x0004002b, 0x00000006, 0x00000014, 0x00000000, 0x0004002b, 0x00000006, + 0x00000015, 0x3f800000, 0x00040020, 0x00000019, 0x00000003, 0x00000007, + 0x00040017, 0x0000001b, 0x00000006, 0x00000003, 0x00040020, 0x0000001c, + 0x00000003, 0x0000001b, 0x0004003b, 0x0000001c, 0x0000001d, 0x00000003, + 0x00040020, 0x0000001e, 0x00000001, 0x0000001b, 0x0004003b, 0x0000001e, + 0x0000001f, 0x00000001, 0x00050036, 0x00000002, 0x00000004, 0x00000000, + 0x00000003, 0x000200f8, 0x00000005, 0x0004003d, 0x00000010, 0x00000013, + 0x00000012, 0x00050051, 0x00000006, 0x00000016, 0x00000013, 0x00000000, + 0x00050051, 0x00000006, 0x00000017, 0x00000013, 0x00000001, 0x00070050, + 0x00000007, 0x00000018, 0x00000016, 0x00000017, 0x00000014, 0x00000015, + 0x00050041, 0x00000019, 0x0000001a, 0x0000000d, 0x0000000f, 0x0003003e, + 0x0000001a, 0x00000018, 0x0004003d, 0x0000001b, 0x00000020, 0x0000001f, + 0x0003003e, 0x0000001d, 0x00000020, 0x000100fd, 0x00010038 +} diff --git a/docs/lang/articles/advanced/odop.md b/docs/lang/articles/advanced/odop1.md similarity index 63% rename from docs/lang/articles/advanced/odop.md rename to docs/lang/articles/advanced/odop1.md index a9f9154308d13a..179d659a2cc59a 100644 --- a/docs/lang/articles/advanced/odop.md +++ b/docs/lang/articles/advanced/odop1.md @@ -2,18 +2,15 @@ sidebar_position: 2 --- -# Objective Data-oriented Programming +# Objective Data-oriented Programming I -Taichi is a -[data-oriented](https://en.wikipedia.org/wiki/Data-oriented_design) -programming (DOP) language. However, simple DOP makes modularization -hard. +Taichi is a [data-oriented](https://en.wikipedia.org/wiki/Data-oriented_design) programming (DOP) language. However, simple DOP makes modularization hard. To allow modularized code, Taichi borrows some concepts from object-oriented programming (OOP). 
For convenience, let's call the hybrid scheme **objective data-oriented programming** (ODOP). -To allow modularized code, Taichi borrow some concepts from -object-oriented programming (OOP). +The ODOP scheme allows you to organize data and methods into a class and call the methods to manipulate the data in the Taichi scope. Taichi offers two different types of classes that serve this purpose, and they are distinguished by the two decorators `@ti.data_oriented` and `@ti.dataclass`, respectively: -For convenience, let's call the hybrid scheme **objective data-oriented -programming** (ODOP). +1. `@ti.data_oriented`: It should be used when your data is actively updated in the Python scope (such as current time and user input events) and tracked in Taichi kernels. This type of class can have native Python objects as members and must be instantiated in the Python scope. This article will discuss this type of class in full detail. + +2. `@ti.dataclass`: It is a wrapper over `ti.types.struct` but offers more flexibility: You can define Taichi functions as its methods and invoke these methods in the Taichi scope. We will discuss this type of class in the next article. ## Data-oriented classes @@ -277,80 +274,3 @@ b = Counter((4, 10)) print(a.num()) # 6 print(b.num()) # 7 ``` - -## Taichi dataclasses - -Taichi provides custom [struct types](../type_system/type.md#compound-types) for developers to associate pieces of data together. However, it is often convenient to have: - 1. A Python representation of the struct type which is more object oriented. - 2. Functions associated with a struct type. (C++ style structs) - - -To achieve these two points, developers can use the `@ti.dataclass` decorator on a Python class. This is heavily inspired by the Python [dataclass](https://docs.python.org/3/library/dataclasses.html) feature, which uses class fields with annotations to create data types. 
- -### Creating a struct from a Python class -Here is an example of how we could create a Taichi struct type from a Python class: - -```python -@ti.dataclass -class Sphere: - center: vec3 - radius: ti.f32 -``` -This will create the *exact* same type as doing: - -```python -Sphere = ti.types.struct(center=vec3, radius=ti.f32) -``` -Using the `@ti.dataclass` decorator will convert the annotated fields in the Python class to members in the resulting struct type. In both of the above examples you would create a field of the struct the same way. - -```python -sphere_field = Sphere.field(shape=(n,)) -``` - -### Associating functions with the struct type -Python classes can have functions attached to them, as can Taichi struct types. Building from the above example, here is how one would add functions to the struct. - -```python -@ti.dataclass -class Sphere: - center: vec3 - radius: ti.f32 - - @ti.func - def area(self): - # a function to run in taichi scope - return 4 * math.pi * self.radius * self.radius - - def is_zero_sized(self): - # a python scope function - return self.radius == 0.0 -``` - -Functions associated with structs follow the same [scope rules](../kernels/syntax.md#taichi-scope-vs-python-scope) as normal functions, in that they can be in Taichi or Python scope. Each instance of the `Sphere` struct type now will have the above functions added to them. The functions can be called such as: - -```python -a_python_struct = Sphere(center=vec3(0.0), radius=1.0) -# calls a python scope function from python -a_python_struct.is_zero_sized() # False - -@ti.kernel -def get_area() -> ti.f32: - a_taichi_struct = Sphere(center=vec3(0.0), radius=4.0) - # return the area of the sphere, a taichi scope function - return a_taichi_struct.area() -get_area() # 201.062... -``` - -### Notes -- Inheritance of Taichi dataclasses is not implemented. 
-- While functions attached to a struct with the `@ti.dataclass` decorator is convenient and encouraged, it is actually possible to associate a function to structs with the older method of defining structs. As mentioned above, the two methods for defining a struct type are identical in their output. To do this, use the `__struct_methods` argument with the `ti.types.struct` call: - -```python -@ti.func -def area(self): - # a function to run in taichi scope - return 4 * math.pi * self.radius * self.radius - -Sphere = ti.types.struct(center=vec3, radius=ti.f32, - __struct_methods={'area': area}) -``` diff --git a/docs/lang/articles/advanced/odop2.md b/docs/lang/articles/advanced/odop2.md new file mode 100644 index 00000000000000..44db428f6f9039 --- /dev/null +++ b/docs/lang/articles/advanced/odop2.md @@ -0,0 +1,83 @@ +--- +sidebar_position: 3 +--- + +# Objective Data-oriented Programming II + + +## Taichi dataclasses + +Taichi provides custom [struct types](../type_system/type.md#compound-types) for developers to assemble pieces of data together. However, it would be more convenient to have: + 1. A Python representation of the struct type which is more object oriented. + 2. Functions associated with a struct type (C++-style structs). + + +To achieve the ends, Taichi enabled the `@ti.dataclass` decorator on a Python class. This is inspired by Python's [dataclass](https://docs.python.org/3/library/dataclasses.html) feature, which uses class fields with annotations to create data types. + +### Creating a struct from a Python class +Here is an example of how we could define a Taichi struct type under a Python class: + +```python +@ti.dataclass +class Sphere: + center: vec3 + radius: ti.f32 +``` +This will create the *exact* same type as using `ti.types.struct()`: + +```python +Sphere = ti.types.struct(center=vec3, radius=ti.f32) +``` +The `@ti.dataclass` decorator converts the annotated members in the Python class to members in the resulting struct type. 
In both of the above examples, you end up with the same struct field.
+
+```python
+sphere_field = Sphere.field(shape=(n,))
+```
+
+### Associating functions with the struct type
+Python classes can have functions attached to them, and so can Taichi struct types. Building from the above example, one can embed functions in the struct as follows:
+
+```python
+@ti.dataclass
+class Sphere:
+    center: vec3
+    radius: ti.f32
+
+    @ti.func
+    def area(self):
+        # a function to run in taichi scope
+        return 4 * math.pi * self.radius * self.radius
+
+    def is_zero_sized(self):
+        # a python scope function
+        return self.radius == 0.0
+```
+
+Functions associated with structs follow the same scope rules as other functions. In other words, they can be placed in either the Taichi scope or the Python scope. Each instance of the `Sphere` struct type now has the above functions attached to it. The functions can be called in the following way:
+
+```python
+a_python_struct = Sphere(center=vec3(0.0), radius=1.0)
+# calls a python scope function from python
+a_python_struct.is_zero_sized() # False
+
+@ti.kernel
+def get_area() -> ti.f32:
+    a_taichi_struct = Sphere(center=vec3(0.0), radius=4.0)
+    # return the area of the sphere, a taichi scope function
+    return a_taichi_struct.area()
+get_area() # 201.062...
+```
+
+### Notes
+- Inheritance of Taichi dataclasses is not supported.
+- While it is convenient and recommended to associate functions with a struct defined via `@ti.dataclass`, `ti.types.struct` can serve the same purpose with the help of the `__struct_methods` argument. As mentioned above, the two methods of defining a struct type produce identical output.
+ +```python +@ti.func +def area(self): + # a function to run in taichi scope + return 4 * math.pi * self.radius * self.radius + +Sphere = ti.types.struct(center=vec3, radius=ti.f32, + __struct_methods={'area': area}) +``` diff --git a/docs/lang/articles/advanced/quant.md b/docs/lang/articles/advanced/quant.md index ec2d60d3faa6c2..95fc0132d26949 100644 --- a/docs/lang/articles/advanced/quant.md +++ b/docs/lang/articles/advanced/quant.md @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 4 --- # Use quantized data types diff --git a/docs/lang/articles/basic/layout.md b/docs/lang/articles/basic/layout.md index e764cc8f0b8117..db46b10ddbdad1 100644 --- a/docs/lang/articles/basic/layout.md +++ b/docs/lang/articles/basic/layout.md @@ -58,13 +58,40 @@ ti.root.dense(ti.ij, (3, 4)).place(x) x = ti.field(ti.f32, shape=(3, 4)) ``` -You can also nest two 1D `dense` statements to describe the same 2D array. +You can also nest two 1D `dense` statements to describe a 2D array of the same shape. ```python {1-2} x = ti.field(ti.f32) ti.root.dense(ti.i, 3).dense(ti.j, 4).place(x) +# has the same shape with +x = ti.field(ti.f32, shape=(3,4)) ``` +:::note + +The above 2D array built with nested `dense` statements is *not* equivalent to the 2D array built with `ti.field`. +Although both statements result in a 2D array of the same shape, they have +different layers of `SNodeTree`. In other words, +```python +x = ti.field(ti.f32) +ti.root.dense(ti.i, 3).dense(ti.j, 4).place(x) +``` +has two `SNodeTree` layers below the root; +```python +x = ti.field(ti.f32) +ti.root.dense(ti.ij, (3, 4)).place(x) +# or equivalently +x = ti.field(ti.f32, shape=(3,4)) +``` +has only one `SNodeTree` layer below the root. 
See the sketch below: + +![2D data-layout sketch](https://user-images.githubusercontent.com/2747993/190545525-305563dc-d09e-4af2-b99b-166d5c4398d0.png) + +The difference here is subtle because both arrays are row-major, but it may have slight performance impact +because the overhead of calculating the `SNodeTree` index is different for the two. + +::: + In a nutshell, the `ti.root.X` statement progressively binds a shape to the corresponding axis. By nesting multiple statements, we can construct a field with higher dimensions. diff --git a/docs/lang/articles/debug/developer_utilities.md b/docs/lang/articles/contribution/developer_utilities.md similarity index 99% rename from docs/lang/articles/debug/developer_utilities.md rename to docs/lang/articles/contribution/developer_utilities.md index aebba0f23d5a0e..c3e1e8bc98d872 100644 --- a/docs/lang/articles/debug/developer_utilities.md +++ b/docs/lang/articles/contribution/developer_utilities.md @@ -1,5 +1,5 @@ --- -sidebar_position: 2 +sidebar_position: 3 --- # Developer Utilities diff --git a/docs/lang/articles/contribution/development_tips.md b/docs/lang/articles/contribution/development_tips.md index b648056b8128a3..43f47cccf812af 100644 --- a/docs/lang/articles/contribution/development_tips.md +++ b/docs/lang/articles/contribution/development_tips.md @@ -1,5 +1,5 @@ --- -sidebar_position: 5 +sidebar_position: 6 --- # Development Tips diff --git a/docs/lang/articles/contribution/doc_writing.md b/docs/lang/articles/contribution/doc_writing.md index fd01e8da852bb3..fc8b9943999d02 100644 --- a/docs/lang/articles/contribution/doc_writing.md +++ b/docs/lang/articles/contribution/doc_writing.md @@ -1,5 +1,5 @@ --- -sidebar_position: 6 +sidebar_position: 7 --- # Markdown Syntax diff --git a/docs/lang/articles/contribution/style_guide_en.md b/docs/lang/articles/contribution/style_guide_en.md index c5aadb327746fa..f613feb389b0be 100644 --- a/docs/lang/articles/contribution/style_guide_en.md +++ 
b/docs/lang/articles/contribution/style_guide_en.md @@ -1,5 +1,5 @@ --- -sidebar_position: 7 +sidebar_position: 8 --- # Document Style Guide diff --git a/docs/lang/articles/contribution/write_test.md b/docs/lang/articles/contribution/write_test.md index ae45477d794081..867ea02e1b7935 100644 --- a/docs/lang/articles/contribution/write_test.md +++ b/docs/lang/articles/contribution/write_test.md @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 4 --- # Write a Python test diff --git a/docs/lang/articles/contribution/writing_cpp_tests.md b/docs/lang/articles/contribution/writing_cpp_tests.md index 80ab14e8c96e4e..08b9289c32141e 100644 --- a/docs/lang/articles/contribution/writing_cpp_tests.md +++ b/docs/lang/articles/contribution/writing_cpp_tests.md @@ -1,5 +1,5 @@ --- -sidebar_position: 4 +sidebar_position: 5 --- # Write a C++ test diff --git a/docs/lang/articles/get-started/accelerate_pytorch.md b/docs/lang/articles/get-started/accelerate_pytorch.md new file mode 100644 index 00000000000000..2c94768a590fb2 --- /dev/null +++ b/docs/lang/articles/get-started/accelerate_pytorch.md @@ -0,0 +1,223 @@ +--- +sidebar_position: 4 +--- + +# Accelerate PyTorch with Taichi + +Taichi and Torch serve different application scenarios but can complement each other. + +- Taichi provides finer control over parallelization and enables more 'granular' (element-level) operations, giving its users much more flexibilities. +- Torch abstracts such details into Tensor-level operations like LEGO bricks, enabling its users to focus on building ML (Machine Learning) models. + +This document uses two examples to explain how to use Taichi kernel to implement data preprocessing operators and custom high-performance ML operators. + +## Data preprocessing + +This section uses padding as an example to show you how Taichi can complement PyTorch in data preprocessing. + +Padding is a commonly-used data preprocessing technique in machine learning. 
For example, padding can prevent convolution operations from changing the size of the input image. However, no PyTorch operators are designed specifically for padding in a specific customized pattern. Previously, you have two options to work around this: + +- Using Python or PyTorch to iterate over matrix elements. +- Writing a C++/CUDA operator and connecting it to PyTorch via Python's custom operator extension. + +The former has very poor efficiency and could become a drain of the neural network training performance; the latter requires large amount of domain-specific knowledge about the underlying hardware architectures and it could take a long while to get started. + +Now, you can use Taichi to pad a brick wall of a specific customized pattern in a much more efficient way. + +The following sections compare PyTorch's implementation of this workflow with Taichi's implementation: + +1. Create a 'brick' and fill it with changing colors. + + ![brick](https://user-images.githubusercontent.com/93570324/191012540-4035cf95-c9e0-4fcf-94f1-1be4cc8abfae.png) + +2. Repeat the bricks horizontally with a fixed offset to form a staggered layout. + + ![bricks](https://user-images.githubusercontent.com/93570324/191012612-2834db6b-8c31-4986-92a9-0c462b2ee9c5.png) + +### Padding with PyTorch + +The following code implements a PyTorch kernel `torch_pad()` for padding. To improve efficiency, the kernel turns the padding process into a series of native PyTorch matrix operations. But such matrix operations are usually unintuitive and require so many intermediate results to be stored in the GPU memory that old GPUs with less RAM cannot even afford them. 
+ +```python +def torch_pad(arr, tile, y): + # image_pixel_to_coord + arr[:, :, 0] = image_height - 1 + ph - arr[:, :, 0] + arr[:, :, 1] -= pw + arr1 = torch.flip(arr, (2, )) + # map_coord + v = torch.floor(arr1[:, :, 1] / tile_height).to(torch.int) + u = torch.floor((arr1[:, :, 0] - v * shift_y[0]) / tile_width).to(torch.int) + uu = torch.stack((u, u), axis=2) + vv = torch.stack((v, v), axis=2) + arr2 = arr1 - uu * shift_x - vv * shift_y + # coord_to_tile_pixel + arr2[:, :, 1] = tile_height - 1 - arr2[:, :, 1] + table = torch.flip(arr2, (2, )) + table = table.view(-1, 2).to(torch.float) + inds = table.mv(y) + gathered = torch.index_select(tile.view(-1), 0, inds.to(torch.long)) + return gathered + +with Timer(): + gathered = torch_pad(coords, tile, y) + torch.cuda.synchronize(device=device) +``` + +### Padding with Taichi + +The following code implements a Taichi kernel `ti_pad()` for padding. The kernel iterates over the pixels in the output image, works out each pixel's corresponding position in the input 'brick', and fills the pixel with the RGB color in that position. + +Taichi automatically runs the top-level for-loops in parallel, and matrix operations written in Taichi are much more readable. Moreover, as you can tell from the following code, `ti_pad()` takes in the PyTorch tensors directly so that it can reuse the memory allocated by PyTorch and would not cause extra overhead from the data transfer between the two frameworks. 
+
+```python
+@ti.kernel
+def ti_pad(image_pixels: ti.types.ndarray(), tile: ti.types.ndarray()):
+    for row, col in ti.ndrange(image_height, image_width):
+        # image_pixel_to_coord
+        x1, y1 = ti.math.ivec2(col - pw, image_height - 1 - row + ph)
+        # map_coord
+        v: ti.i32 = ti.floor(y1 / tile_height)
+        u: ti.i32 = ti.floor((x1 - v * shift_y[0]) / tile_width)
+        x2, y2 = ti.math.ivec2(x1 - u * shift_x[0] - v * shift_y[0],
+               y1 - u * shift_x[1] - v * shift_y[1])
+        # coord_to_tile_pixel
+        x, y = ti.math.ivec2(tile_height - 1 - y2, x2)
+        image_pixels[row, col] = tile[x, y]
+with Timer():
+    ti_pad(image_pixels, tile)
+    ti.sync()
+```
+
+### Performance comparison
+
+As the following table shows, the PyTorch kernel takes 30.392 ms[1] to complete padding; the Taichi kernel takes 0.267 ms only. Taichi outruns PyTorch by more than 100x (30.392/0.267).
+
+`torch_pad()` launches 58 CUDA kernels, whilst Taichi compiles all computation into one CUDA kernel. The fewer the CUDA kernels, the less GPU launch overhead is incurred. Moreover, the Taichi kernel manages to save a lot more redundant memory operations than the PyTorch kernel. The GPU launch overhead and the redundant memory operations are the potential source for optimization and acceleration.
+
+| Kernel function | Average time (ms) | CUDA kernels launched (number) |
+| :--------------- | :----------------- | :------------------------------ |
+| `torch_pad()` | 30.392 | 58 |
+| `ti_pad()` | 0.267 | 1 |
+
+> - GPU: RTX3090
+> - PyTorch version: v1.12.1; Taichi version: v1.1.0
+> - The actual acceleration rate may vary depending on your implementation and GPU setup.
+
+## Customize ML operators
+
+Researchers in machine learning usually spend a lot of time designing model architectures. Because they cannot find decent support for their newly-designed or customized operators from PyTorch, they have to spend time studying CUDA for fine tuning and to improve efficiency.
But writing in CUDA is hard, tuning CUDA code is even harder, and accelerating model iteration with CUDA is difficult.
+
+[This repo](https://github.com/BlinkDL/RWKV-CUDA) introduces an example of customizing an ML operator in CUDA. The author developed an RWKV language model using a custom one-dimensional depthwise convolution operator. The model does not involve much computation but still runs slow because PyTorch does not have native support for it. So, the author customized the operator in CUDA using a set of optimization techniques, such as loop fusion and Shared Memory, and achieved a performance 20x better than he did with PyTorch.
+
+Referring to the CUDA code[3], we customized a Taichi depthwise convolution operator[4] in the RWKV model using the same optimization techniques.
+
+The function of the depthwise convolution operator:
+
+1. Iterates over two input Tensors `w` and `k`,
+2. Adds up the product of the respective elements in `w` and `k` into `s`,
+3. Saves `s` to an output Tensor `out`.
+
+The following subsections take the Baseline implementations as an example to show you how to implement a depthwise convolution operator with Python, PyTorch, CUDA, and Taichi, and how they compare to each other. With Taichi, you can accelerate your ML model development with ease and get rid of the tedious low-level parallel programming.
+
+| Implementation | Readability | Performance |
+| :-------------- | :----------- | :----------------------------------------- |
+| Python | Excellent | The slowest |
+| PyTorch | Poor | Slow |
+| CUDA | Poor | Fast |
+| Taichi | Excellent | Comparable to that of CUDA or even better |
+
+### Implement a depthwise convolution operator with Python
+
+The Python reference code is straightforward and easy to understand, but it runs so slow that the result can hardly make itself into the diagram above.
+ +```python +def run_formula_very_slow(w, k, B, C, T, eps): + out = torch.empty((B, C, T), device='cpu') + for b in range(B): + for c in range(C): + for t in range(T): + s = eps + for u in range(t-T+1, t+1): + s += w[c][0][(T-1)-(t-u)] * k[b][c][u+T-1] + out[b][c][t] = s + return out +``` + +### Implement a depthwise convolution operator with PyTorch + +It is very challenging to translate the Python reference code above to this code line. To come up with this, you have to know very well the underlying logic of these PyTorch operators. + +```python +out = eps + F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(k), w.unsqueeze(1), groups=C) +``` + +### Implement a depthwise convolution operator with CUDA + +The CUDA reference code has much poorer readability: The outmost loop is implicitly defined by thread parallelism. The index calculation is complicated, and each element's position in the matrix is not clear at a glance. Besides, it could be rather error-prone to implement more sophisticated algorithms with CUDA. + +```cpp +__global__ void kernel_forward(const float* w, const float* k, float* x, + const float eps, const int B, const int C, const int T) +{ + const int i = blockIdx.y; + const int t = threadIdx.x; + float s = eps; + const float* www = w + (i % C) * T + (T - 1) - t; + const float* kk = k + i * T; + for (int u = 0; u <= t; u++){ + s += www[u] * kk[u]; + } + x[i * T + t] = s; +} +``` + +Further, you need a proper compile environment to run your CUDA code! If you have precompiled your CUDA code into a dynamic link library, then you also need to spend time working hard on trivial matters such as environment settings and Python API encapsulation. + +### Implement a depthwise convolution operator with Taichi + +The Taichi reference code is almost identical to its Python counterpart. 
And a good advantage that Taichi has over CUDA is that, without worrying about low-level details like parallelization and pointer offsets, one can easily use Taichi to achieve comparable performance.
+
+```python
+@ti.kernel
+def taichi_forward_v0(
+        out: ti.types.ndarray(field_dim=3),
+        w: ti.types.ndarray(field_dim=3),
+        k: ti.types.ndarray(field_dim=3),
+        eps: ti.f32):
+
+    for b, c, t in out:
+        s = eps
+        for u in range(t-T+1, t+1):
+            s += w[c, 0, (T-1)-(t-u)] * k[b, c, u+T-1]
+        out[b, c, t] = s
+```
+
+### Performance comparison
+
+The following diagram shows that Taichi's performance is always comparable to its CUDA counterpart, or even better under certain circumstances.
+
+![comparison](https://user-images.githubusercontent.com/93570324/191012778-99408533-c3a2-4868-a750-e853a63d2697.png)
+
+> - The RWKV compute time in the diagram is in milliseconds. The less the compute time, the better the performance is.
+> - 'Baseline': The reference code is a faithful implementation of the algorithm without any modification.
+> - v1 to v3: The three different optimized implementations.
+
+## Recap
+
+PyTorch is efficient in handling a large proportion of computation tasks in machine learning. Still, there are niches and needs that it falls short of addressing, such as native support for many operators and unsatisfactory runtime performance.
+
+As a high-performance programming language embedded in Python, Taichi features:
+
+- Easy readability,
+- Optimized memory consumption,
+- Runtime performance comparable to that of CUDA,
+- Good portability that encourages reproducible code sharing among the community.
+
+All these features set Taichi apart as a convenient tool for ML operator customization. The two examples provided in this document give you a glimpse of how Taichi and PyTorch can complement each other to solve real-world high-performance programming issues.
+ +## Reference + +- [1] [Pure PyTorch padding](https://github.com/ailzhang/blog_code/blob/master/tile/demo_torch.py) +- [2] [Padding PyTorch tensor in Taichi kernel](https://github.com/ailzhang/blog_code/blob/master/tile/demo_taichi.py) +- [3] [RWKV-CUDA](https://github.com/BlinkDL/RWKV-CUDA/tree/main/depthwise_conv1d) +- [4] [RWKV-Taichi ](https://github.com/ailzhang/blog_code/tree/master/rwkv) diff --git a/docs/lang/articles/type_system/type.md b/docs/lang/articles/type_system/type.md index 1ddd482703bdcc..27fdb8cc3cc8c4 100644 --- a/docs/lang/articles/type_system/type.md +++ b/docs/lang/articles/type_system/type.md @@ -255,7 +255,7 @@ The code above serves the same purpose as the line below does but provides bette Sphere = ti.types.struct(center=vec3, radius=float) ``` -Another advantage of using `@ti.dataclass` over `ti.types.struct` is that you can define member functions in a dataclass and call them in the Taichi scope, making object-oriented programming (OOP) possible. See the article [objective data-oriented programming](../advanced/odop.md) for more details. +Another advantage of using `@ti.dataclass` over `ti.types.struct` is that you can define member functions in a dataclass and call them in the Taichi scope, making object-oriented programming (OOP) possible. See the article [objective data-oriented programming](../advanced/odop2.md) for more details. 
### Initialization diff --git a/python/taichi/_funcs.py b/python/taichi/_funcs.py index 40a9f3b097cc6f..c8b0ee6280f344 100644 --- a/python/taichi/_funcs.py +++ b/python/taichi/_funcs.py @@ -3,7 +3,7 @@ from taichi.lang import impl, matrix, ops from taichi.lang.impl import expr_init, get_runtime, grouped, static from taichi.lang.kernel_impl import func, pyfunc -from taichi.lang.matrix import Matrix, Vector, is_vector +from taichi.lang.matrix import Matrix, Vector from taichi.types import f32, f64 from taichi.types.annotations import template @@ -59,9 +59,6 @@ def _matrix_transpose(mat): Returns: Transpose of the input matrix. """ - if static(is_vector(mat)): - # Convert to row vector - return matrix.Matrix([[mat(i) for i in range(mat.n)]]) return matrix.Matrix([[mat(i, j) for i in range(mat.n)] for j in range(mat.m)], ndim=mat.ndim) diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index 6fac2c6418e0ce..f72456a664f3d4 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -15,8 +15,9 @@ from taichi.lang.ast.symbol_resolver import ASTResolver from taichi.lang.exception import TaichiSyntaxError, TaichiTypeError from taichi.lang.field import Field +from taichi.lang.impl import current_cfg from taichi.lang.matrix import (Matrix, MatrixType, Vector, _PyScopeMatrixImpl, - _TiScopeMatrixImpl) + _TiScopeMatrixImpl, make_matrix) from taichi.lang.snode import append from taichi.lang.util import in_taichi_scope, is_taichi_class, to_taichi_type from taichi.types import (annotations, ndarray_type, primitive_types, @@ -114,6 +115,12 @@ def build_Assign(ctx, node): @staticmethod def build_assign_slice(ctx, node_target, values, is_static_assign): target = ASTTransformer.build_Subscript(ctx, node_target, get_ref=True) + if current_cfg().real_matrix: + if isinstance(node_target.value.ptr, + any_array.AnyArray) and isinstance( + values, (list, tuple)): + values = make_matrix(values) + 
if isinstance(node_target.value.ptr, Matrix): if isinstance(node_target.value.ptr._impl, _TiScopeMatrixImpl): target._assign(values) @@ -1110,7 +1117,7 @@ def build_nested_mesh_for(ctx, node): loop_var = expr.Expr(ctx.ast_builder.make_id_expr('')) ctx.create_variable(loop_name, loop_var) begin = expr.Expr(0) - end = node.iter.ptr.size + end = ti_ops.cast(node.iter.ptr.size, primitive_types.i32) ctx.ast_builder.begin_frontend_range_for(loop_var.ptr, begin.ptr, end.ptr) entry_expr = _ti_core.get_relation_access( diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index dc92af1b77a928..e91a17d1971379 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -16,7 +16,7 @@ from taichi.lang.kernel_arguments import SparseMatrixProxy from taichi.lang.matrix import (Matrix, MatrixField, MatrixNdarray, MatrixType, Vector, _IntermediateMatrix, - _MatrixFieldElement, make_matrix) + _MatrixFieldElement) from taichi.lang.mesh import (ConvType, MeshElementFieldProxy, MeshInstance, MeshRelationAccessProxy, MeshReorderedMatrixFieldProxy, @@ -64,7 +64,8 @@ def expr_init(rhs): entries = [[rhs(i, j) for j in range(rhs.m)] for i in range(rhs.n)] return make_matrix(entries) - if isinstance(rhs, Vector) or getattr(rhs, "ndim", None) == 1: + if (isinstance(rhs, Vector) + or getattr(rhs, "ndim", None) == 1) and rhs.m == 1: # _IntermediateMatrix may reach here return Vector(rhs.to_list(), ndim=rhs.ndim) return Matrix(rhs.to_list(), ndim=rhs.ndim) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 3fcddd7dfe7035..c1e148ef005840 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -117,6 +117,10 @@ def is_vector(x): return isinstance(x, Vector) or getattr(x, "ndim", None) == 1 +def is_col_vector(x): + return is_vector(x) and getattr(x, "m", None) == 1 + + class _MatrixBaseImpl: def __init__(self, m, n, entries): self.m = m @@ -447,6 +451,11 @@ def __init__(self, is_matrix = isinstance(arr[0], Iterable) 
and not is_vector(self) initializer = _make_entries_initializer(is_matrix) self.ndim = 2 if is_matrix else 1 + if not is_matrix and isinstance(arr[0], Iterable): + flattened = [] + for row in arr: + flattened += row + arr = flattened if in_python_scope() or is_ref: mat = initializer.pyscope_or_ref(arr) @@ -493,7 +502,7 @@ def __init__(self, def _element_wise_binary(self, foo, other): other = self._broadcast_copy(other) - if is_vector(self): + if is_col_vector(self): return Vector([foo(self(i), other(i)) for i in range(self.n)], ndim=self.ndim) return Matrix([[foo(self(i, j), other(i, j)) for j in range(self.m)] @@ -502,7 +511,7 @@ def _element_wise_binary(self, foo, other): def _broadcast_copy(self, other): if isinstance(other, (list, tuple)): - if is_vector(self): + if is_col_vector(self): other = Vector(other, ndim=self.ndim) else: other = Matrix(other, ndim=self.ndim) @@ -553,6 +562,11 @@ def __matmul__(self, other): """ assert isinstance(other, Matrix), "rhs of `@` is not a matrix / vector" + if (is_col_vector(self)) and not is_vector(other): + # left multiplication + assert self.n == other.m, f"Dimension mismatch between (left multiplication) shapes ({self.n}, {self.m}), ({other.n}, {other.m})" + return other.transpose() @ self + # right multiplication assert self.m == other.n, f"Dimension mismatch between shapes ({self.n}, {self.m}), ({other.n}, {other.m})" entries = [] for i in range(self.n): @@ -562,6 +576,8 @@ def __matmul__(self, other): for k in range(1, other.n): acc = acc + self(i, k) * other(k, j) entries[i].append(acc) + if is_col_vector(other): + return Vector(entries) return Matrix(entries) # host access & python scope operation @@ -657,7 +673,7 @@ def to_list(self): This is similar to `numpy.ndarray`'s `flatten` and `ravel` methods, the difference is that this function always returns a new list. 
""" - if is_vector(self): + if is_col_vector(self): return [self(i) for i in range(self.n)] return [[self(i, j) for j in range(self.m)] for i in range(self.n)] @@ -679,7 +695,7 @@ def cast(self, dtype): >>> B [0.0, 1.0, 2.0] """ - if is_vector(self): + if is_col_vector(self): # when using _IntermediateMatrix, we can only check `self.ndim` return Vector( [ops_mod.cast(self(i), dtype) for i in range(self.n)]) @@ -1624,8 +1640,8 @@ def fill(self, val): (list, tuple)) and isinstance(val[0], numbers.Number): assert self.m == 1 val = tuple(val) - elif is_vector(val) or self.ndim == 1: - val = tuple([(val(i), ) for i in range(self.n)]) + elif is_vector(val): + val = tuple([(val(i), ) for i in range(self.n * self.m)]) elif isinstance(val, Matrix): val_tuple = [] for i in range(val.n): diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py index 4b6c38ebdeb4ef..758892310e193d 100644 --- a/python/taichi/lang/misc.py +++ b/python/taichi/lang/misc.py @@ -347,8 +347,6 @@ def init(arch=None, # changed by the Vulkan backend initialization on OS X. current_dir = os.getcwd() - cfg = impl.default_cfg() - cfg.offline_cache = True # Enable offline cache in frontend instead of C++ side # Check if installed version meets the requirements. 
if require_version is not None: check_require_version(require_version) @@ -365,6 +363,9 @@ def init(arch=None, kwargs = _deepcopy(kwargs) reset() + cfg = impl.default_cfg() + cfg.offline_cache = True # Enable offline cache in frontend instead of C++ side + spec_cfg = _SpecialConfig() env_comp = _EnvironmentConfigurator(kwargs, cfg) env_spec = _EnvironmentConfigurator(kwargs, spec_cfg) diff --git a/python/taichi/shaders/SetImage_vk.frag b/python/taichi/shaders/SetImage_vk.frag index 19715a7327145a..0ad5b40dbf4571 100644 --- a/python/taichi/shaders/SetImage_vk.frag +++ b/python/taichi/shaders/SetImage_vk.frag @@ -9,11 +9,10 @@ layout(location = 0) out vec4 out_color; layout(binding = 1) uniform UBO { float x_factor; float y_factor; -} -ubo; + int is_transposed; +} ubo; void main() { - vec2 coord = frag_texcoord.yx * vec2(ubo.y_factor,ubo.x_factor); - out_color = texture(texSampler, coord); - // out_color = vec4(frag_texcoord.xy,0,1); + vec2 coord = frag_texcoord * vec2(ubo.x_factor,ubo.y_factor); + out_color = texture(texSampler, ubo.is_transposed != 0 ? 
coord.yx : coord); } diff --git a/python/taichi/shaders/SetImage_vk_frag.spv b/python/taichi/shaders/SetImage_vk_frag.spv index 42ea34c3a07010..750ef2e42a3b2c 100644 Binary files a/python/taichi/shaders/SetImage_vk_frag.spv and b/python/taichi/shaders/SetImage_vk_frag.spv differ diff --git a/python/taichi/shaders/SetImage_vk_vert.spv b/python/taichi/shaders/SetImage_vk_vert.spv index 211bd77be93591..0c41bc96fa42be 100644 Binary files a/python/taichi/shaders/SetImage_vk_vert.spv and b/python/taichi/shaders/SetImage_vk_vert.spv differ diff --git a/python/taichi/ui/canvas.py b/python/taichi/ui/canvas.py index 5f09f359d6d944..b1783dff3a2b32 100644 --- a/python/taichi/ui/canvas.py +++ b/python/taichi/ui/canvas.py @@ -1,3 +1,7 @@ +from taichi._lib import core as _ti_core +from taichi.lang import impl +from taichi.lang._texture import Texture + from .staging_buffer import (copy_colors_to_vbo, copy_vertices_to_vbo, get_vbo_field, to_rgba8) from .utils import get_field_info @@ -28,9 +32,15 @@ def set_image(self, img): img (numpy.ndarray, :class:`~taichi.MatrixField`, :class:`~taichi.Field`, :class:`~taichi.Texture`): \ the image to be shown. """ - staging_img = to_rgba8(img) - info = get_field_info(staging_img) - self.canvas.set_image(info) + is_texture = isinstance(img, Texture) + prog_is_vk = impl.pytaichi.prog.config.arch == _ti_core.Arch.vulkan + # FIXME: Remove this hack. 
Maybe add a query function for whether the texture can be presented + if is_texture and prog_is_vk: + self.canvas.set_image_texture(img.tex) + else: + staging_img = to_rgba8(img) + info = get_field_info(staging_img) + self.canvas.set_image(info) def triangles(self, vertices, diff --git a/python/taichi/ui/scene.py b/python/taichi/ui/scene.py index 46f95deb413a49..a058ada09ddb81 100644 --- a/python/taichi/ui/scene.py +++ b/python/taichi/ui/scene.py @@ -307,10 +307,14 @@ def mesh_instance(self, index_count = vertex_count else: index_count = indices.shape[0] - if instance_count is None: - instance_count = transforms.shape[0] - if transforms and (transforms.m != 4 or transforms.n != 4): - raise Exception("Error! Transform matrix must be 4x4 shape") + if transforms: + if (transforms.m != 4 or transforms.n != 4): + raise Exception("Error! Transform matrix must be 4x4 shape") + if instance_count is None: + instance_count = transforms.shape[0] + else: + instance_count = 1 + copy_normals_to_vbo(vbo, normals) vbo_info = get_field_info(vbo) indices_info = get_field_info(indices) diff --git a/setup.py b/setup.py index c7fe63fa51b177..7196db9694c8a8 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ import glob import multiprocessing import os +import platform import shutil import sys from distutils.command.clean import clean @@ -131,6 +132,11 @@ def get_cmake_args(): if sys.platform != 'win32': os.environ['SKBUILD_BUILD_OPTIONS'] = f'-j{num_threads}' + if sys.platform == "darwin": + if platform.machine() == "arm64": + cmake_args += ["-DCMAKE_OSX_ARCHITECTURES=arm64"] + else: + cmake_args += ["-DCMAKE_OSX_ARCHITECTURES=x86_64"] return cmake_args diff --git a/taichi/analysis/offline_cache_util.cpp b/taichi/analysis/offline_cache_util.cpp index 3207e0d887834d..ee41a3973784d6 100644 --- a/taichi/analysis/offline_cache_util.cpp +++ b/taichi/analysis/offline_cache_util.cpp @@ -66,6 +66,7 @@ static std::vector get_offline_cache_key_of_compile_config( 
serializer(config->experimental_auto_mesh_local); serializer(config->auto_mesh_local_default_occupacy); serializer(config->real_matrix); + serializer(config->real_matrix_scalarize); serializer.finalize(); return serializer.data; @@ -184,7 +185,7 @@ std::string get_cache_path_by_arch(const std::string &base_path, Arch arch) { std::string subdir; if (arch_uses_llvm(arch)) { subdir = "llvm"; - } else if (arch == Arch::vulkan) { + } else if (arch == Arch::vulkan || arch == Arch::opengl) { subdir = "gfx"; } else { return base_path; diff --git a/taichi/aot/module_builder.cpp b/taichi/aot/module_builder.cpp index 1b1b71fb58c8ff..9c1a8a71fda0da 100644 --- a/taichi/aot/module_builder.cpp +++ b/taichi/aot/module_builder.cpp @@ -5,9 +5,6 @@ namespace taichi { namespace lang { void AotModuleBuilder::add(const std::string &identifier, Kernel *kernel) { - if (!kernel->lowered() && Kernel::supports_lowering(kernel->arch)) { - kernel->lower(/*to_executable=*/!arch_uses_llvm(kernel->arch)); - } add_per_backend(identifier, kernel); } @@ -25,9 +22,6 @@ void AotModuleBuilder::add_field(const std::string &identifier, void AotModuleBuilder::add_kernel_template(const std::string &identifier, const std::string &key, Kernel *kernel) { - if (!kernel->lowered() && Kernel::supports_lowering(kernel->arch)) { - kernel->lower(); - } add_per_backend_tmpl(identifier, key, kernel); } diff --git a/taichi/cache/gfx/cache_manager.cpp b/taichi/cache/gfx/cache_manager.cpp index 833d43ad9ee992..f3284cee0f0c91 100644 --- a/taichi/cache/gfx/cache_manager.cpp +++ b/taichi/cache/gfx/cache_manager.cpp @@ -47,13 +47,6 @@ struct CacheCleanerUtils { using MetadataType = gfx::CacheManager::Metadata; using KernelMetaData = MetadataType::KernelMetadata; - // To load metadata from file - static bool load_metadata(const CacheCleanerConfig &config, - MetadataType &result) { - return read_from_binary_file( - result, taichi::join_path(config.path, config.metadata_filename)); - } - // To save metadata as file static bool 
save_metadata(const CacheCleanerConfig &config, const MetadataType &data) { @@ -81,13 +74,6 @@ struct CacheCleanerUtils { return true; } - // To check version - static bool check_version(const CacheCleanerConfig &config, - const Version &version) { - return version[0] == TI_VERSION_MAJOR && version[1] == TI_VERSION_MINOR && - version[2] == TI_VERSION_PATCH; - } - // To get cache files name static std::vector get_cache_files( const CacheCleanerConfig &config, @@ -106,6 +92,12 @@ struct CacheCleanerUtils { taichi::join_path(config.path, kDebuggingAotMetadataFilename)); taichi::remove(taichi::join_path(config.path, kGraphMetadataFilename)); } + + // To check if a file is cache file + static bool is_valid_cache_file(const CacheCleanerConfig &config, + const std::string &name) { + return filename_extension(name) == "spv"; + } }; } // namespace offline_cache @@ -184,10 +176,12 @@ void CacheManager::dump_with_merging() const { cache_builder->dump(path_, ""); // Update offline_cache_metadata.tcb + using offline_cache::load_metadata_with_checking; + using Error = offline_cache::LoadMetadataError; Metadata old_data; const auto filename = taichi::join_path(path_, kOfflineCacheMetadataFilename); - if (read_from_binary_file(old_data, filename)) { + if (load_metadata_with_checking(old_data, filename) == Error::kNoError) { for (auto &[k, v] : offline_cache_metadata_.kernels) { auto iter = old_data.kernels.find(k); if (iter != old_data.kernels.end()) { // Update diff --git a/taichi/codegen/dx12/codegen_dx12.cpp b/taichi/codegen/dx12/codegen_dx12.cpp index 4be95a53f7b1b4..bc2a65e2ae94e9 100644 --- a/taichi/codegen/dx12/codegen_dx12.cpp +++ b/taichi/codegen/dx12/codegen_dx12.cpp @@ -1,3 +1,5 @@ +#include "llvm/IR/IntrinsicsDirectX.h" + #include "taichi/codegen/dx12/codegen_dx12.h" #include "taichi/codegen/dx12/dx12_llvm_passes.h" #include "taichi/rhi/dx12/dx12_api.h" @@ -26,24 +28,13 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { } void 
create_offload_range_for(OffloadedStmt *stmt) override { - int step = 1; - - // In parallel for-loops reversing the order doesn't make sense. - // However, we may need to support serial offloaded range for's in the - // future, so it still makes sense to reverse the order here. - if (stmt->reversed) { - step = -1; - } + auto tls_prologue = create_xlogue(stmt->tls_prologue); - auto *tls_prologue = create_xlogue(stmt->tls_prologue); - - // The loop body llvm::Function *body; { auto guard = get_function_creation_guard( {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0), - llvm::Type::getInt8PtrTy(*llvm_context), - tlctx->get_data_type()}); + get_tls_buffer_type(), tlctx->get_data_type()}); auto loop_var = create_entry_block_alloca(PrimitiveType::i32); loop_vars_llvm[stmt].push_back(loop_var); @@ -53,36 +44,22 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { body = guard.body; } - llvm::Value *epilogue = create_xlogue(stmt->tls_epilogue); + auto epilogue = create_xlogue(stmt->tls_epilogue); auto [begin, end] = get_range_for_bounds(stmt); - - // adaptive block_dim - if (prog->config.cpu_block_dim_adaptive) { - int num_items = (stmt->end_value - stmt->begin_value) / std::abs(step); - int num_threads = stmt->num_cpu_threads; - int items_per_thread = std::max(1, num_items / (num_threads * 32)); - // keep each task has at least 512 items to amortize scheduler overhead - // also saturate the value to 1024 for better load balancing - stmt->block_dim = std::min(1024, std::max(512, items_per_thread)); - } - - create_call( - "gpu_parallel_range_for", - {get_arg(0), tlctx->get_constant(stmt->num_cpu_threads), begin, end, - tlctx->get_constant(step), tlctx->get_constant(stmt->block_dim), - tls_prologue, body, epilogue, tlctx->get_constant(stmt->tls_size)}); + create_call("gpu_parallel_range_for", + {get_arg(0), begin, end, tls_prologue, body, epilogue, + tlctx->get_constant(stmt->tls_size)}); } void create_offload_mesh_for(OffloadedStmt *stmt) override { - auto 
*tls_prologue = create_mesh_xlogue(stmt->tls_prologue); + auto tls_prologue = create_mesh_xlogue(stmt->tls_prologue); llvm::Function *body; { auto guard = get_function_creation_guard( {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0), - llvm::Type::getInt8PtrTy(*llvm_context), - tlctx->get_data_type()}); + get_tls_buffer_type(), tlctx->get_data_type()}); for (int i = 0; i < stmt->mesh_prologue->size(); i++) { auto &s = stmt->mesh_prologue->statements[i]; @@ -91,6 +68,7 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { if (stmt->bls_prologue) { stmt->bls_prologue->accept(this); + call("block_barrier"); // "__syncthreads()" } auto loop_test_bb = @@ -99,21 +77,27 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { llvm::BasicBlock::Create(*llvm_context, "loop_body", func); auto func_exit = llvm::BasicBlock::Create(*llvm_context, "func_exit", func); - auto loop_index = - create_entry_block_alloca(llvm::Type::getInt32Ty(*llvm_context)); - builder->CreateStore(tlctx->get_constant(0), loop_index); + auto i32_ty = llvm::Type::getInt32Ty(*llvm_context); + auto loop_index = create_entry_block_alloca(i32_ty); + llvm::Value *thread_idx = + builder->CreateIntrinsic(llvm::Intrinsic::dx_thread_id_in_group, + {i32_ty}, {builder->getInt32(0)}); + // FIXME: use correct block dim. 
+ llvm::Value *block_dim = + builder->getInt32(64); /*builder->CreateIntrinsic( + llvm::Intrinsic::dx, {}, {});*/ + builder->CreateStore(thread_idx, loop_index); builder->CreateBr(loop_test_bb); { builder->SetInsertPoint(loop_test_bb); + auto cond = builder->CreateICmp( + llvm::CmpInst::Predicate::ICMP_SLT, + builder->CreateLoad( #ifdef TI_LLVM_15 - auto *loop_index_load = - builder->CreateLoad(builder->getInt32Ty(), loop_index); -#else - auto *loop_index_load = builder->CreateLoad(loop_index); + i32_ty, #endif - auto cond = builder->CreateICmp( - llvm::CmpInst::Predicate::ICMP_SLT, loop_index_load, + loop_index), llvm_val[stmt->owned_num_local.find(stmt->major_from_type) ->second]); builder->CreateCondBr(cond, loop_body_bb, func_exit); @@ -126,33 +110,31 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { auto &s = stmt->body->statements[i]; s->accept(this); } + builder->CreateStore(builder->CreateAdd(builder->CreateLoad( #ifdef TI_LLVM_15 - auto *loop_index_load = - builder->CreateLoad(builder->getInt32Ty(), loop_index); -#else - auto *loop_index_load = builder->CreateLoad(loop_index); + i32_ty, #endif - builder->CreateStore( - builder->CreateAdd(loop_index_load, tlctx->get_constant(1)), - loop_index); + loop_index), + block_dim), + loop_index); builder->CreateBr(loop_test_bb); builder->SetInsertPoint(func_exit); } if (stmt->bls_epilogue) { + call("block_barrier"); // "__syncthreads()" stmt->bls_epilogue->accept(this); } body = guard.body; } - llvm::Value *epilogue = create_mesh_xlogue(stmt->tls_epilogue); + auto tls_epilogue = create_mesh_xlogue(stmt->tls_epilogue); - create_call("gpu_parallel_mesh_for", - {get_arg(0), tlctx->get_constant(stmt->num_cpu_threads), - tlctx->get_constant(stmt->mesh->num_patches), - tlctx->get_constant(stmt->block_dim), tls_prologue, body, - epilogue, tlctx->get_constant(stmt->tls_size)}); + create_call( + "gpu_parallel_mesh_for", + {get_arg(0), tlctx->get_constant(stmt->mesh->num_patches), tls_prologue, + body, tls_epilogue, 
tlctx->get_constant(stmt->tls_size)}); } void create_bls_buffer(OffloadedStmt *stmt) { @@ -227,7 +209,7 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { #ifdef TI_WITH_LLVM static std::vector generate_dxil_from_llvm( - LLVMCompiledData &compiled_data, + LLVMCompiledTask &compiled_data, taichi::lang::Kernel *kernel) { // generate dxil from llvm ir. auto offloaded_local = compiled_data.tasks; @@ -286,7 +268,7 @@ KernelCodeGenDX12::CompileResult KernelCodeGenDX12::compile() { return Result; } -LLVMCompiledData KernelCodeGenDX12::compile_task( +LLVMCompiledTask KernelCodeGenDX12::compile_task( std::unique_ptr &&module, OffloadedStmt *stmt) { TaskCodeGenLLVMDX12 gen(kernel, stmt); diff --git a/taichi/codegen/dx12/codegen_dx12.h b/taichi/codegen/dx12/codegen_dx12.h index 1b9e920e71873f..5d352231a1e6b9 100644 --- a/taichi/codegen/dx12/codegen_dx12.h +++ b/taichi/codegen/dx12/codegen_dx12.h @@ -22,7 +22,7 @@ class KernelCodeGenDX12 : public KernelCodeGen { }; CompileResult compile(); #ifdef TI_WITH_LLVM - LLVMCompiledData compile_task( + LLVMCompiledTask compile_task( std::unique_ptr &&module = nullptr, OffloadedStmt *stmt = nullptr) override; #endif diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 85995d20fb4907..8c51a8f38f3074 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -2,6 +2,7 @@ #include #include +#include #include "taichi/program/program.h" #include "taichi/program/kernel.h" @@ -104,7 +105,7 @@ class TaskCodegen : public IRVisitor { Result run() { ir_->init_header(); kernel_function_ = ir_->new_function(); // void main(); - ir_->debug(spv::OpName, kernel_function_, "main"); + ir_->debug_name(spv::OpName, kernel_function_, "main"); compile_args_struct(); compile_ret_struct(); @@ -146,8 +147,28 @@ class TaskCodegen : public IRVisitor { } } - void visit(PrintStmt *print_stmt) override { - TI_WARN("Printing is not yet supported in Vulkan"); + void 
visit(PrintStmt *stmt) override { + if (!device_->get_cap(DeviceCapability::spirv_has_non_semantic_info)) { + return; + } + + std::string formats; + std::vector vals; + + for (auto const &content : stmt->contents) { + if (std::holds_alternative(content)) { + auto arg_stmt = std::get(content); + TI_ASSERT(!arg_stmt->ret_type->is()); + + auto value = ir_->query_value(arg_stmt->raw_name()); + vals.push_back(value); + formats += data_type_format(arg_stmt->ret_type); + } else { + auto arg_str = std::get(content); + formats += arg_str; + } + } + ir_->call_debugprintf(formats, vals); } void visit(ConstStmt *const_stmt) override { @@ -1649,15 +1670,15 @@ class TaskCodegen : public IRVisitor { task_attribs_.advisory_total_num_threads = kMaxNumThreadsGridStrideLoop; } task_attribs_.advisory_num_threads_per_group = stmt->block_dim; - ir_->debug(spv::OpName, begin_expr_value, "begin_expr_value"); - ir_->debug(spv::OpName, total_elems, total_elems_name); + ir_->debug_name(spv::OpName, begin_expr_value, "begin_expr_value"); + ir_->debug_name(spv::OpName, total_elems, total_elems_name); spirv::Value begin_ = ir_->add(ir_->cast(ir_->i32_type(), ir_->get_global_invocation_id(0)), begin_expr_value); - ir_->debug(spv::OpName, begin_, "begin_"); + ir_->debug_name(spv::OpName, begin_, "begin_"); spirv::Value end_ = ir_->add(total_elems, begin_expr_value); - ir_->debug(spv::OpName, end_, "end_"); + ir_->debug_name(spv::OpName, end_, "end_"); const std::string total_invocs_name = "total_invocs"; // For now, |total_invocs_name| is equal to |total_elems|. Once we support // dynamic range, they will be different. 
@@ -1679,7 +1700,7 @@ class TaskCodegen : public IRVisitor { false); */ - ir_->debug(spv::OpName, total_invocs, total_invocs_name); + ir_->debug_name(spv::OpName, total_invocs, total_invocs_name); // Must get init label after making value(to make sure they are correct) spirv::Label init_label = ir_->current_label(); @@ -2300,6 +2321,7 @@ void KernelCodegen::run(TaichiKernelAttributes &kernel_attribs, std::vector optimized_spv(task_res.spirv_code); size_t last_size; + bool success = true; do { last_size = optimized_spv.size(); bool result = false; @@ -2307,25 +2329,31 @@ void KernelCodegen::run(TaichiKernelAttributes &kernel_attribs, (result = !spirv_opt_->Run(optimized_spv.data(), optimized_spv.size(), &optimized_spv, spirv_opt_options_)), "SPIRV optimization failed"); - if (result) + if (result) { + success = false; break; + } } while (last_size != optimized_spv.size()); TI_TRACE("SPIRV-Tools-opt: binary size, before={}, after={}", task_res.spirv_code.size(), optimized_spv.size()); // Enable to dump SPIR-V assembly of kernels -#if 0 - std::string spirv_asm; - spirv_tools_->Disassemble(optimized_spv, &spirv_asm); - auto kernel_name = tp.ti_kernel_name; - TI_WARN("SPIR-V Assembly dump for {} :\n{}\n\n", kernel_name, spirv_asm); - - std::ofstream fout(kernel_name + ".spv", std::ios::binary | std::ios::out); - fout.write(reinterpret_cast(optimized_spv.data()), - optimized_spv.size() * sizeof(uint32_t)); - fout.close(); -#endif + if constexpr (false) { + std::vector &spirv = + success ? 
optimized_spv : task_res.spirv_code; + + std::string spirv_asm; + spirv_tools_->Disassemble(optimized_spv, &spirv_asm); + auto kernel_name = tp.ti_kernel_name; + TI_WARN("SPIR-V Assembly dump for {} :\n{}\n\n", kernel_name, spirv_asm); + + std::ofstream fout(kernel_name + ".spv", + std::ios::binary | std::ios::out); + fout.write(reinterpret_cast(spirv.data()), + spirv.size() * sizeof(uint32_t)); + fout.close(); + } kernel_attribs.tasks_attribs.push_back(std::move(task_res.task_attribs)); generated_spirv.push_back(std::move(optimized_spv)); diff --git a/taichi/codegen/spirv/spirv_ir_builder.cpp b/taichi/codegen/spirv/spirv_ir_builder.cpp index f9d217d4d0696f..3505ceb11b0bca 100644 --- a/taichi/codegen/spirv/spirv_ir_builder.cpp +++ b/taichi/codegen/spirv/spirv_ir_builder.cpp @@ -80,6 +80,12 @@ void IRBuilder::init_header() { .add("SPV_KHR_storage_buffer_storage_class") .commit(&header_); + if (device_->get_cap(cap::spirv_has_non_semantic_info)) { + ib_.begin(spv::OpExtension) + .add("SPV_KHR_non_semantic_info") + .commit(&header_); + } + if (device_->get_cap(cap::spirv_has_variable_ptr)) { ib_.begin(spv::OpExtension) .add("SPV_KHR_variable_pointers") @@ -125,7 +131,8 @@ std::vector IRBuilder::finalize() { data.insert(data.end(), header_.begin(), header_.end()); data.insert(data.end(), entry_.begin(), entry_.end()); data.insert(data.end(), exec_mode_.begin(), exec_mode_.end()); - data.insert(data.end(), debug_.begin(), debug_.end()); + data.insert(data.end(), strings_.begin(), strings_.end()); + data.insert(data.end(), names_.begin(), names_.end()); data.insert(data.end(), decorate_.begin(), decorate_.end()); data.insert(data.end(), global_.begin(), global_.end()); data.insert(data.end(), func_header_.begin(), func_header_.end()); @@ -135,6 +142,10 @@ std::vector IRBuilder::finalize() { void IRBuilder::init_pre_defs() { ext_glsl450_ = ext_inst_import("GLSL.std.450"); + if (device_->get_cap(cap::spirv_has_non_semantic_info)) { + debug_printf_ = 
ext_inst_import("NonSemantic.DebugPrintf"); + } + t_bool_ = declare_primitive_type(get_data_type()); if (device_->get_cap(cap::spirv_has_int8)) { t_int8_ = declare_primitive_type(get_data_type()); @@ -207,6 +218,12 @@ void IRBuilder::init_pre_defs() { const_i32_one_ = int_immediate_number(t_int32_, 1); } +Value IRBuilder::debug_string(std::string s) { + Value val = new_value(SType(), ValueKind::kNormal); + ib_.begin(spv::OpString).add_seq(val, s).commit(&strings_); + return val; +} + PhiValue IRBuilder::make_phi(const SType &out_type, uint32_t num_incoming) { Value val = new_value(out_type, ValueKind::kNormal); ib_.begin(spv::OpPhi).add_seq(out_type, val); @@ -575,7 +592,7 @@ SType IRBuilder::create_struct_type( for (auto &[type, name, offset] : components) { this->decorate(spv::OpMemberDecorate, struct_type, i, spv::DecorationOffset, offset); - this->debug(spv::OpMemberName, struct_type, i, name); + this->debug_name(spv::OpMemberName, struct_type, i, name); i++; } @@ -595,7 +612,7 @@ Value IRBuilder::buffer_struct_argument(const SType &struct_type, storage_class = spv::StorageClassStorageBuffer; } - this->debug(spv::OpName, struct_type, name + "_t"); + this->debug_name(spv::OpName, struct_type, name + "_t"); if (device_->get_cap(cap::spirv_version) < 0x10300) { // NOTE: BufferBlock was deprecated in SPIRV 1.3 @@ -608,14 +625,14 @@ Value IRBuilder::buffer_struct_argument(const SType &struct_type, SType ptr_type = get_pointer_type(struct_type, storage_class); - this->debug(spv::OpName, ptr_type, name + "_ptr"); + this->debug_name(spv::OpName, ptr_type, name + "_ptr"); Value val = new_value(ptr_type, ValueKind::kStructArrayPtr); ib_.begin(spv::OpVariable) .add_seq(ptr_type, val, storage_class) .commit(&global_); - this->debug(spv::OpName, val, name); + this->debug_name(spv::OpName, val, name); this->decorate(spv::OpDecorate, val, spv::DecorationDescriptorSet, descriptor_set); @@ -631,20 +648,20 @@ Value IRBuilder::uniform_struct_argument(const SType &struct_type, // 
use StorageClassStorageBuffer instead. spv::StorageClass storage_class = spv::StorageClassUniform; - this->debug(spv::OpName, struct_type, name + "_t"); + this->debug_name(spv::OpName, struct_type, name + "_t"); this->decorate(spv::OpDecorate, struct_type, spv::DecorationBlock); SType ptr_type = get_pointer_type(struct_type, storage_class); - this->debug(spv::OpName, ptr_type, name + "_ptr"); + this->debug_name(spv::OpName, ptr_type, name + "_ptr"); Value val = new_value(ptr_type, ValueKind::kStructArrayPtr); ib_.begin(spv::OpVariable) .add_seq(ptr_type, val, storage_class) .commit(&global_); - this->debug(spv::OpName, val, name); + this->debug_name(spv::OpName, val, name); this->decorate(spv::OpDecorate, val, spv::DecorationDescriptorSet, descriptor_set); @@ -669,18 +686,18 @@ Value IRBuilder::buffer_argument(const SType &value_type, auto typed_name = name + "_" + value_type.dt.to_string(); - this->debug(spv::OpName, sarr_type, typed_name + "_struct_array"); + this->debug_name(spv::OpName, sarr_type, typed_name + "_struct_array"); SType ptr_type = get_pointer_type(sarr_type, storage_class); - this->debug(spv::OpName, sarr_type, typed_name + "_ptr"); + this->debug_name(spv::OpName, sarr_type, typed_name + "_ptr"); Value val = new_value(ptr_type, ValueKind::kStructArrayPtr); ib_.begin(spv::OpVariable) .add_seq(ptr_type, val, storage_class) .commit(&global_); - this->debug(spv::OpName, val, typed_name); + this->debug_name(spv::OpName, val, typed_name); this->decorate(spv::OpDecorate, val, spv::DecorationDescriptorSet, descriptor_set); @@ -727,7 +744,7 @@ Value IRBuilder::texture_argument(int num_channels, descriptor_set); this->decorate(spv::OpDecorate, val, spv::DecorationBinding, binding); - this->debug(spv::OpName, val, "tex"); + this->debug_name(spv::OpName, val, "tex"); this->global_values.push_back(val); @@ -752,7 +769,7 @@ Value IRBuilder::storage_image_argument(int num_channels, descriptor_set); this->decorate(spv::OpDecorate, val, spv::DecorationBinding, 
binding); - this->debug(spv::OpName, val, "tex"); + this->debug_name(spv::OpName, val, "tex"); this->global_values.push_back(val); @@ -1184,7 +1201,7 @@ void IRBuilder::register_value(std::string name, Value value) { if (it != value_name_tbl_.end() && it->second.flag != ValueKind::kConstant) { TI_ERROR("{} already exists.", name); } - this->debug( + this->debug_name( spv::OpName, value, fmt::format("{}_{}", name, value.stype.dt.to_string())); // Debug info value_name_tbl_[name] = value; @@ -1422,13 +1439,13 @@ void IRBuilder::init_random_function(Value global_tmp_) { ib_.begin(spv::OpVariable) .add_seq(local_type, rand_w_, spv::StorageClassPrivate) .commit(&global_); - debug(spv::OpName, rand_x_, "_rand_x"); - debug(spv::OpName, rand_y_, "_rand_y"); - debug(spv::OpName, rand_z_, "_rand_z"); - debug(spv::OpName, rand_w_, "_rand_w"); + debug_name(spv::OpName, rand_x_, "_rand_x"); + debug_name(spv::OpName, rand_y_, "_rand_y"); + debug_name(spv::OpName, rand_z_, "_rand_z"); + debug_name(spv::OpName, rand_w_, "_rand_w"); SType gtmp_type = get_pointer_type(t_uint32_, spv::StorageClassStorageBuffer); Value rand_gtmp_ = new_value(gtmp_type, ValueKind::kVariablePtr); - debug(spv::OpName, rand_gtmp_, "rand_gtmp"); + debug_name(spv::OpName, rand_gtmp_, "rand_gtmp"); auto load_var = [&](Value pointer, const SType &res_type) { TI_ASSERT(pointer.flag == ValueKind::kVariablePtr || diff --git a/taichi/codegen/spirv/spirv_ir_builder.h b/taichi/codegen/spirv/spirv_ir_builder.h index fcf76c8471c522..337985bad360b2 100644 --- a/taichi/codegen/spirv/spirv_ir_builder.h +++ b/taichi/codegen/spirv/spirv_ir_builder.h @@ -209,10 +209,12 @@ class IRBuilder { } template - void debug(spv::Op op, Args &&...args) { - ib_.begin(op).add_seq(std::forward(args)...).commit(&debug_); + void debug_name(spv::Op op, Args &&...args) { + ib_.begin(op).add_seq(std::forward(args)...).commit(&names_); } + Value debug_string(std::string str); + template void execution_mode(Value func, Args &&...args) { 
ib_.begin(spv::OpExecutionMode) @@ -460,6 +462,18 @@ class IRBuilder { return val; } + // Create a debugPrintf call + void call_debugprintf(std::string formats, const std::vector &args) { + Value format_str = debug_string(formats); + Value val = new_value(t_void_, ValueKind::kNormal); + ib_.begin(spv::OpExtInst) + .add_seq(t_void_, val, debug_printf_, 1, format_str); + for (const auto &arg : args) { + ib_.add(arg); + } + ib_.commit(&function_); + } + // Local allocate, load, store methods Value alloca_variable(const SType &type); Value alloca_workgroup_array(const SType &type); @@ -551,6 +565,9 @@ class IRBuilder { // glsl 450 extension Value ext_glsl450_; + // debugprint extension + Value debug_printf_; + SType t_bool_; SType t_int8_; SType t_int16_; @@ -603,7 +620,17 @@ class IRBuilder { // Header segment std::vector exec_mode_; // Debug segment - std::vector debug_; + // According to SPIR-V spec, the following debug instructions must be + // grouped in the order: + // - All OpString, OpSourceExtension, OpSource, and OpSourceContinued, + // without forward references. + // - All OpName and all OpMemberName. + // - All OpModuleProcessed instructions. 
+ + // OpString segment + std::vector strings_; + // OpName segment + std::vector names_; // Annotation segment std::vector decorate_; // Global segment: types, variables, types diff --git a/taichi/common/serialization.h b/taichi/common/serialization.h index 84d77a6c165dd7..8458c1b7514d11 100644 --- a/taichi/common/serialization.h +++ b/taichi/common/serialization.h @@ -345,15 +345,20 @@ class BinarySerializer : public Serializer { preserved = 0; } + template + typename std::enable_if::type retrieve_length() { + return *reinterpret_cast(c_data); + } + void finalize() { - if (writing) { + if constexpr (writing) { if (c_data) { *reinterpret_cast(&c_data[0]) = head; } else { *reinterpret_cast(&data[0]) = head; } } else { - assert(head == *reinterpret_cast(c_data)); + assert(head == retrieve_length()); } } @@ -880,6 +885,21 @@ operator<<(std::ostream &os, const T &t) { } // Returns true if deserialization succeeded. +template +bool read_from_binary(T &t, + const void *bin, + std::size_t len, + bool match_all = true) { + BinaryInputSerializer reader; + reader.initialize(const_cast(bin)); + if (len != reader.retrieve_length()) { + return false; + } + reader(t); + auto head = reader.head; + return match_all ? 
head == len : head <= len; +} + template bool read_from_binary_file(T &t, const std::string &file_name) { BinaryInputSerializer reader; diff --git a/taichi/inc/rhi_constants.inc.h b/taichi/inc/rhi_constants.inc.h index 023e94356b4336..d5d60dc725dadb 100644 --- a/taichi/inc/rhi_constants.inc.h +++ b/taichi/inc/rhi_constants.inc.h @@ -28,6 +28,7 @@ PER_DEVICE_CAPABILITY(spirv_has_subgroup_basic) PER_DEVICE_CAPABILITY(spirv_has_subgroup_vote) PER_DEVICE_CAPABILITY(spirv_has_subgroup_arithmetic) PER_DEVICE_CAPABILITY(spirv_has_subgroup_ballot) +PER_DEVICE_CAPABILITY(spirv_has_non_semantic_info) // Graphics Caps PER_DEVICE_CAPABILITY(wide_lines) #endif diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index cadf3663b78fb2..cd022734835fb1 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -531,6 +531,19 @@ class MatrixFieldExpression : public Expression { MatrixFieldExpression(const std::vector &fields, const std::vector &element_shape) : fields(fields), element_shape(element_shape) { + for (auto &field : fields) { + TI_ASSERT(field.is()); + } + TI_ASSERT(!fields.empty()); + auto compute_type = + fields[0].cast()->dt->get_compute_type(); + for (auto &field : fields) { + if (field.cast()->dt->get_compute_type() != + compute_type) { + throw TaichiRuntimeError( + "Member fields of a matrix field must have the same compute type"); + } + } } void type_check(CompileConfig *config) override { diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index acfa4264a9d363..57dcd27a0de560 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -40,10 +40,10 @@ #include "taichi/rhi/dx/dx_api.h" #endif -#if defined(TI_ARCH_x64) +#if defined(_M_X64) || defined(__x86_64) // For _MM_SET_FLUSH_ZERO_MODE #include -#endif +#endif // defined(_M_X64) || defined(__x86_64) namespace taichi { namespace lang { @@ -55,9 +55,10 @@ Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) { // For performance considerations 
and correctness of QuantFloatType // operations, we force floating-point operations to flush to zero on all // backends (including CPUs). -#if defined(TI_ARCH_x64) +#if defined(_M_X64) || defined(__x86_64) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); -#else +#endif // defined(_M_X64) || defined(__x86_64) +#if defined(__arm64__) || defined(__aarch64__) // Enforce flush to zero on arm64 CPUs // https://developer.arm.com/documentation/100403/0201/register-descriptions/advanced-simd-and-floating-point-registers/aarch64-register-descriptions/fpcr--floating-point-control-register?lang=en std::uint64_t fpcr; @@ -68,7 +69,7 @@ Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) { : : "ri"(fpcr | (1 << 24))); // Bit 24 is FZ __asm__ __volatile__(""); -#endif +#endif // defined(__arm64__) || defined(__aarch64__) config = default_compile_config; config.arch = desired_arch; // TODO: allow users to run in debug mode without out-of-bound checks @@ -541,5 +542,11 @@ void Program::prepare_runtime_context(RuntimeContext *ctx) { program_impl_->prepare_runtime_context(ctx); } +void Program::enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) { + program_impl_->enqueue_compute_op_lambda(op, image_refs); +} + } // namespace lang } // namespace taichi diff --git a/taichi/program/program.h b/taichi/program/program.h index 329237993e7041..e2dc1ef5f0d8ea 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -331,6 +331,15 @@ class TI_DLL_EXPORT Program { void prepare_runtime_context(RuntimeContext *ctx); + /** Enqueue a custom compute op to the current program execution flow. 
+ * + * @params op The lambda that is invoked to construct the custom compute Op + * @params image_refs The image resource references used in this compute Op + */ + void enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs); + /** * TODO(zhanlue): Remove this interface * diff --git a/taichi/program/program_impl.h b/taichi/program/program_impl.h index b140b50b760ca9..4f2a5d5a473810 100644 --- a/taichi/program/program_impl.h +++ b/taichi/program/program_impl.h @@ -13,6 +13,15 @@ namespace taichi { namespace lang { +// Represents an image resource reference for a compute/render Op +struct ComputeOpImageRef { + DeviceAllocation image; + // The requested initial layout of the image, when Op is invoked + ImageLayout initial_layout; + // The final layout the image will be in once Op finishes + ImageLayout final_layout; +}; + struct RuntimeContext; class ProgramImpl { @@ -128,6 +137,12 @@ class ProgramImpl { virtual void prepare_runtime_context(RuntimeContext *ctx) { } + virtual void enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) { + TI_NOT_IMPLEMENTED; + } + virtual void print_memory_profiler_info( std::vector> &snode_trees_, uint64 *result_buffer) { diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index 3e8096a98d72f9..f142766111c64b 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -203,11 +203,40 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, void *coo_values_ptr, int nnz) { #if defined(TI_WITH_CUDA) + // Step 1: Sort coo first + cusparseHandle_t cusparse_handle = NULL; + CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle); + cusparseSpVecDescr_t vec_permutation; + cusparseDnVecDescr_t vec_values; + void *d_permutation = NULL, *d_values_sorted = NULL; + CUDADriver::get_instance().malloc(&d_permutation, nnz * sizeof(int)); + CUDADriver::get_instance().malloc(&d_values_sorted, nnz * sizeof(float)); + 
CUSPARSEDriver::get_instance().cpCreateSpVec( + &vec_permutation, nnz, nnz, d_permutation, d_values_sorted, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); + CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_values, nnz, coo_values_ptr, + CUDA_R_32F); + size_t bufferSize = 0; + CUSPARSEDriver::get_instance().cpXcoosort_bufferSizeExt( + cusparse_handle, rows_, cols_, nnz, coo_row_ptr, coo_col_ptr, + &bufferSize); + void *dbuffer = NULL; + if (bufferSize > 0) + CUDADriver::get_instance().malloc(&dbuffer, bufferSize); + // Setup permutation vector to identity + CUSPARSEDriver::get_instance().cpCreateIdentityPermutation( + cusparse_handle, nnz, d_permutation); + CUSPARSEDriver::get_instance().cpXcoosortByRow(cusparse_handle, rows_, cols_, + nnz, coo_row_ptr, coo_col_ptr, + d_permutation, dbuffer); + CUSPARSEDriver::get_instance().cpGather(cusparse_handle, vec_values, + vec_permutation); + CUDADriver::get_instance().memcpy_device_to_device( + coo_values_ptr, d_values_sorted, nnz * sizeof(float)); + // Step 2: coo to csr void *csr_row_offset_ptr = NULL; CUDADriver::get_instance().malloc(&csr_row_offset_ptr, sizeof(int) * (rows_ + 1)); - cusparseHandle_t cusparse_handle; - CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle); CUSPARSEDriver::get_instance().cpCoo2Csr( cusparse_handle, (void *)coo_row_ptr, nnz, rows_, (void *)csr_row_offset_ptr, CUSPARSE_INDEX_BASE_ZERO); @@ -216,9 +245,14 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr, &matrix_, rows_, cols_, nnz, csr_row_offset_ptr, coo_col_ptr, coo_values_ptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); + CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation); + CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values); CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle); // TODO: free csr_row_offset_ptr // CUDADriver::get_instance().mem_free(csr_row_offset_ptr); + CUDADriver::get_instance().mem_free(d_values_sorted); + 
CUDADriver::get_instance().mem_free(d_permutation); + CUDADriver::get_instance().mem_free(dbuffer); #endif } diff --git a/taichi/program/texture.h b/taichi/program/texture.h index 0b54f6a3487506..7b439a79eac40b 100644 --- a/taichi/program/texture.h +++ b/taichi/program/texture.h @@ -46,6 +46,14 @@ class TI_DLL_EXPORT Texture { ~Texture(); + BufferFormat get_buffer_format() const { + return format_; + } + + std::array get_size() const { + return {width_, height_, depth_}; + } + private: DeviceAllocation texture_alloc_{kDeviceNullAllocation}; DataType dtype_; diff --git a/taichi/python/export_ggui.cpp b/taichi/python/export_ggui.cpp index bd6ad0fc885ac5..b68e60f67a3d6a 100644 --- a/taichi/python/export_ggui.cpp +++ b/taichi/python/export_ggui.cpp @@ -290,6 +290,10 @@ struct PyCanvas { canvas->set_image({img}); } + void set_image_texture(Texture *texture) { + canvas->set_image(texture); + } + void scene(PyScene &scene) { canvas->scene(scene.scene); } @@ -499,6 +503,7 @@ void export_ggui(py::module &m) { py::class_(m, "PyCanvas") .def("set_background_color", &PyCanvas::set_background_color) .def("set_image", &PyCanvas::set_image) + .def("set_image_texture", &PyCanvas::set_image_texture) .def("triangles", &PyCanvas::triangles) .def("lines", &PyCanvas::lines) .def("circles", &PyCanvas::circles) diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index f43b711b3269d2..82de4581b83b35 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -204,6 +204,8 @@ void export_lang(py::module &m) { &CompileConfig::ndarray_use_cached_allocator) .def_readwrite("use_mesh", &CompileConfig::use_mesh) .def_readwrite("real_matrix", &CompileConfig::real_matrix) + .def_readwrite("real_matrix_scalarize", + &CompileConfig::real_matrix_scalarize) .def_readwrite("cc_compile_cmd", &CompileConfig::cc_compile_cmd) .def_readwrite("cc_link_cmd", &CompileConfig::cc_link_cmd) .def_readwrite("quant_opt_store_fusion", diff --git 
a/taichi/rhi/cuda/cuda_types.h b/taichi/rhi/cuda/cuda_types.h index 64a369f5a76b59..88bb33951e3e74 100644 --- a/taichi/rhi/cuda/cuda_types.h +++ b/taichi/rhi/cuda/cuda_types.h @@ -441,8 +441,10 @@ typedef struct cusparseContext *cusparseHandle_t; struct cusparseMatDescr; typedef struct cusparseMatDescr *cusparseMatDescr_t; +struct cusparseSpVecDescr; struct cusparseDnVecDescr; struct cusparseSpMatDescr; +typedef struct cusparseSpVecDescr *cusparseSpVecDescr_t; typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t; typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t; typedef enum { diff --git a/taichi/rhi/cuda/cusparse_functions.inc.h b/taichi/rhi/cuda/cusparse_functions.inc.h index 7476b83c094730..1a70e36de7883e 100644 --- a/taichi/rhi/cuda/cusparse_functions.inc.h +++ b/taichi/rhi/cuda/cusparse_functions.inc.h @@ -13,6 +13,12 @@ PER_CUSPARSE_FUNCTION(cpCreateMatDescr, cusparseCreateMatDescr, cusparseMatDescr PER_CUSPARSE_FUNCTION(cpSetMatType, cusparseSetMatType, cusparseMatDescr_t, cusparseMatrixType_t); PER_CUSPARSE_FUNCTION(cpSetMatIndexBase, cusparseSetMatIndexBase, cusparseMatDescr_t, cusparseIndexBase_t); PER_CUSPARSE_FUNCTION(cpDestroySpMat, cusparseDestroySpMat, cusparseSpMatDescr_t); +PER_CUSPARSE_FUNCTION(cpCreateSpVec, cusparseCreateSpVec, cusparseSpVecDescr_t* ,int ,int,void*,void*,cusparseIndexType_t,cusparseIndexBase_t,cudaDataType); +PER_CUSPARSE_FUNCTION(cpDestroySpVec, cusparseDestroySpVec, cusparseSpVecDescr_t); +PER_CUSPARSE_FUNCTION(cpCreateIdentityPermutation, cusparseCreateIdentityPermutation, cusparseHandle_t, int, void*); +PER_CUSPARSE_FUNCTION(cpXcoosort_bufferSizeExt, cusparseXcoosort_bufferSizeExt, cusparseHandle_t,int ,int,int, void* ,void* ,void*); +PER_CUSPARSE_FUNCTION(cpXcoosortByRow, cusparseXcoosortByRow, cusparseHandle_t,int,int,int,void* ,void* ,void* ,void*); +PER_CUSPARSE_FUNCTION(cpGather, cusparseGather, cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t); // cusparse dense vector description 
PER_CUSPARSE_FUNCTION(cpCreateDnVec, cusparseCreateDnVec, cusparseDnVecDescr_t*, int, void*, cudaDataType); diff --git a/taichi/rhi/device.cpp b/taichi/rhi/device.cpp index 460cbba420c403..1d3af12cd80221 100644 --- a/taichi/rhi/device.cpp +++ b/taichi/rhi/device.cpp @@ -244,7 +244,14 @@ void Device::memcpy_direct(DevicePtr dst, DevicePtr src, uint64_t size) { dst.device->memcpy_internal(dst, src, size); return; } - // Inter-device copy +#if TI_WITH_VULKAN && TI_WITH_LLVM + // cross-device copy directly + else if (dynamic_cast(dst.device) && + dynamic_cast(src.device)) { + memcpy_cpu_to_vulkan(dst, src, size); + return; + } +#endif #if TI_WITH_VULKAN && TI_WITH_CUDA if (dynamic_cast(dst.device) && dynamic_cast(src.device)) { diff --git a/taichi/rhi/interop/vulkan_cpu_interop.cpp b/taichi/rhi/interop/vulkan_cpu_interop.cpp index e3a1158457ceed..598e833977eee5 100644 --- a/taichi/rhi/interop/vulkan_cpu_interop.cpp +++ b/taichi/rhi/interop/vulkan_cpu_interop.cpp @@ -15,6 +15,23 @@ namespace lang { using namespace taichi::lang::vulkan; using namespace taichi::lang::cpu; +void memcpy_cpu_to_vulkan(DevicePtr dst, DevicePtr src, uint64_t size) { + // Note that `dst` must point to host-visible memory, if `dst` point to + // device-local memory, please choose to use `memcpy_via_staging`. 
+ VulkanDevice *vk_dev = dynamic_cast(dst.device); + CpuDevice *cpu_dev = dynamic_cast(src.device); + + DeviceAllocation src_alloc(src); + + CpuDevice::AllocInfo src_alloc_info = cpu_dev->get_alloc_info(src_alloc); + + unsigned char *dst_ptr = (unsigned char *)(vk_dev->map_range(dst, size)); + unsigned char *src_ptr = (unsigned char *)src_alloc_info.ptr + src.offset; + + memcpy(dst_ptr, src_ptr, size); + vk_dev->unmap(dst); +} + void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, DevicePtr staging, DevicePtr src, @@ -39,6 +56,9 @@ void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, } #else +void memcpy_cpu_to_vulkan(DevicePtr dst, DevicePtr src, uint64_t size) { + TI_NOT_IMPLEMENTED; +} void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, DevicePtr stagin, DevicePtr src, diff --git a/taichi/rhi/interop/vulkan_cpu_interop.h b/taichi/rhi/interop/vulkan_cpu_interop.h index ac95959f0251f5..4042ceca80be9a 100644 --- a/taichi/rhi/interop/vulkan_cpu_interop.h +++ b/taichi/rhi/interop/vulkan_cpu_interop.h @@ -5,6 +5,8 @@ namespace taichi { namespace lang { +void memcpy_cpu_to_vulkan(DevicePtr dst, DevicePtr src, uint64_t size); + void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, DevicePtr staging, DevicePtr src, diff --git a/taichi/rhi/opengl/CMakeLists.txt b/taichi/rhi/opengl/CMakeLists.txt index 1a78c4fd49d997..4e88b328064a13 100644 --- a/taichi/rhi/opengl/CMakeLists.txt +++ b/taichi/rhi/opengl/CMakeLists.txt @@ -22,7 +22,4 @@ target_include_directories(${OPENGL_RHI} ) target_link_libraries(opengl_rhi PRIVATE glfw) - -set(SPIRV_CROSS_CLI false) -add_subdirectory(${PROJECT_SOURCE_DIR}/external/SPIRV-Cross ${PROJECT_BINARY_DIR}/external/SPIRV-Cross) target_link_libraries(opengl_rhi PRIVATE spirv-cross-glsl spirv-cross-core) diff --git a/taichi/rhi/vulkan/vulkan_device.cpp b/taichi/rhi/vulkan/vulkan_device.cpp index 595b6b757e347e..4e1b30fa0d6af9 100644 --- a/taichi/rhi/vulkan/vulkan_device.cpp +++ b/taichi/rhi/vulkan/vulkan_device.cpp @@ -782,9 +782,13 @@ 
VulkanCommandList::VulkanCommandList(VulkanDevice *ti_device, info.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; vkBeginCommandBuffer(buffer->buffer, &info); + +// Workaround for MacOS: https://github.com/taichi-dev/taichi/issues/5888 +#if !defined(__APPLE__) vkCmdResetQueryPool(buffer->buffer, query_pool_->query_pool, 0, 2); vkCmdWriteTimestamp(buffer->buffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, query_pool_->query_pool, 0); +#endif } VulkanCommandList::~VulkanCommandList() { @@ -1304,8 +1308,11 @@ vkapi::IVkRenderPass VulkanCommandList::current_renderpass() { vkapi::IVkCommandBuffer VulkanCommandList::finalize() { if (!finalized_) { +// Workaround for MacOS: https://github.com/taichi-dev/taichi/issues/5888 +#if !defined(__APPLE__) vkCmdWriteTimestamp(buffer_->buffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, query_pool_->query_pool, 1); +#endif vkEndCommandBuffer(buffer_->buffer); finalized_ = true; } @@ -1719,11 +1726,17 @@ void VulkanStream::command_sync() { continue; } + double duration_us = 0.0; + +// Workaround for MacOS: https://github.com/taichi-dev/taichi/issues/5888 +#if !defined(__APPLE__) uint64_t t[2]; vkGetQueryPoolResults(device_.vk_device(), cmdbuf.query_pool->query_pool, 0, 2, sizeof(uint64_t) * 2, &t, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); - double duration_us = (t[1] - t[0]) * props.limits.timestampPeriod / 1000.0; + duration_us = (t[1] - t[0]) * props.limits.timestampPeriod / 1000.0; +#endif + device_time_elapsed_us_ += duration_us; } @@ -2390,12 +2403,12 @@ void VulkanSurface::create_swap_chain() { extent.height = std::max(capabilities.minImageExtent.height, std::min(capabilities.maxImageExtent.height, extent.height)); - TI_INFO("Creating suface of {}x{}", width, height); + TI_INFO("Creating suface of {}x{}", extent.width, extent.height); VkImageUsageFlags usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT; - this->width_ = width; - this->height_ = height; + 
this->width_ = extent.width; + this->height_ = extent.height; VkSwapchainCreateInfoKHR createInfo; createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; diff --git a/taichi/rhi/vulkan/vulkan_device_creator.cpp b/taichi/rhi/vulkan/vulkan_device_creator.cpp index 8910934a65f36a..1bd8f522198d10 100644 --- a/taichi/rhi/vulkan/vulkan_device_creator.cpp +++ b/taichi/rhi/vulkan/vulkan_device_creator.cpp @@ -46,7 +46,16 @@ vk_debug_callback(VkDebugUtilsMessageSeverityFlagBitsEXT message_severity, const VkDebugUtilsMessengerCallbackDataEXT *p_callback_data, void *p_user_data) { if (message_severity > VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) { - TI_WARN("validation layer: {}", p_callback_data->pMessage); + TI_WARN("validation layer: {}, {}", message_type, + p_callback_data->pMessage); + } + if (message_type == VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT && + message_severity == VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT && + strstr(p_callback_data->pMessage, "DEBUG-PRINTF") != NULL) { + // Message format is "BLABLA | MessageID=xxxxx | " + std::string msg(p_callback_data->pMessage); + auto const pos = msg.find_last_of("|"); + std::cout << msg.substr(pos + 2); } return VK_FALSE; } @@ -56,6 +65,7 @@ void populate_debug_messenger_create_info( *create_info = {}; create_info->sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; create_info->messageSeverity = + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; @@ -256,8 +266,10 @@ void VulkanDeviceCreator::create_instance(bool manual_create) { create_info.pApplicationInfo = &app_info; if (params_.enable_validation_layer) { - TI_ASSERT_INFO(check_validation_layer_support(), - "validation layers requested but not available"); + if (!check_validation_layer_support()) { + TI_WARN("validation layers requested but not available, turning off..."); + 
params_.enable_validation_layer = false; + } } VkDebugUtilsMessengerCreateInfoEXT debug_create_info{}; @@ -273,6 +285,19 @@ void VulkanDeviceCreator::create_instance(bool manual_create) { create_info.pNext = nullptr; } + // Response to `DebugPrintf`. + VkValidationFeaturesEXT vf = {}; + if (params_.enable_validation_layer) { + std::array vfes = { + VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT}; + + vf.sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT; + vf.pNext = create_info.pNext; + vf.enabledValidationFeatureCount = vfes.size(); + vf.pEnabledValidationFeatures = vfes.data(); + create_info.pNext = &vf; + } + std::unordered_set extensions; for (auto ext : get_required_extensions(params_.enable_validation_layer)) { extensions.insert(std::string(ext)); @@ -515,6 +540,9 @@ void VulkanDeviceCreator::create_logical_device(bool manual_create) { enabled_extensions.push_back(ext.extensionName); } else if (name == VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME) { enabled_extensions.push_back(ext.extensionName); + } else if (name == VK_KHR_SHADER_NON_SEMANTIC_INFO_EXTENSION_NAME) { + ti_device_->set_cap(DeviceCapability::spirv_has_non_semantic_info, true); + enabled_extensions.push_back(ext.extensionName); } else if (std::find(params_.additional_device_extensions.begin(), params_.additional_device_extensions.end(), name) != params_.additional_device_extensions.end()) { diff --git a/taichi/runtime/gfx/runtime.cpp b/taichi/runtime/gfx/runtime.cpp index 9d5333512a9e5a..273e9a35157ec8 100644 --- a/taichi/runtime/gfx/runtime.cpp +++ b/taichi/runtime/gfx/runtime.cpp @@ -702,6 +702,23 @@ size_t GfxRuntime::get_root_buffer_size(int id) const { return it->second; } +void GfxRuntime::enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) { + for (const auto &ref : image_refs) { + TI_ASSERT(last_image_layouts_.find(ref.image.alloc_id) != + last_image_layouts_.end()); + transition_image(ref.image, ref.initial_layout); + } + + ensure_current_cmdlist(); + 
op(device_, current_cmdlist_.get()); + + for (const auto &ref : image_refs) { + last_image_layouts_[ref.image.alloc_id] = ref.final_layout; + } +} + GfxRuntime::RegisterParams run_codegen( Kernel *kernel, Device *device, diff --git a/taichi/runtime/gfx/runtime.h b/taichi/runtime/gfx/runtime.h index 810299b83e0c51..9e9116fbc78b2f 100644 --- a/taichi/runtime/gfx/runtime.h +++ b/taichi/runtime/gfx/runtime.h @@ -11,6 +11,7 @@ #include "taichi/program/compile_config.h" #include "taichi/struct/snode_tree.h" #include "taichi/program/snode_expr_utils.h" +#include "taichi/program/program_impl.h" namespace taichi { namespace lang { @@ -124,6 +125,10 @@ class TI_DLL_EXPORT GfxRuntime { size_t get_root_buffer_size(int id) const; + void enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs); + private: friend class taichi::lang::gfx::SNodeTreeManager; diff --git a/taichi/runtime/llvm/llvm_offline_cache.cpp b/taichi/runtime/llvm/llvm_offline_cache.cpp index fed16a0911d710..70d6bf853889f7 100644 --- a/taichi/runtime/llvm/llvm_offline_cache.cpp +++ b/taichi/runtime/llvm/llvm_offline_cache.cpp @@ -27,13 +27,6 @@ using Format = LlvmOfflineCache::Format; constexpr char kMetadataFilename[] = "metadata"; constexpr char kMetadataFileLockName[] = "metadata.lock"; -static bool is_current_llvm_cache_version( - const LlvmOfflineCache::Version &ver) { - // TODO(PGZXB): Do more detailed checking - return ver[0] == TI_VERSION_MAJOR && ver[1] == TI_VERSION_MINOR && - ver[2] == TI_VERSION_PATCH; -} - static std::string get_llvm_cache_metadata_file_path(const std::string &dir) { return taichi::join_path(dir, std::string(kMetadataFilename) + ".tcb"); } @@ -60,13 +53,6 @@ struct CacheCleanerUtils { using MetadataType = LlvmOfflineCache; using KernelMetaData = typename MetadataType::KernelMetadata; - // To load metadata from file - static bool load_metadata(const CacheCleanerConfig &config, - MetadataType &result) { - return read_from_binary_file( - result, 
taichi::join_path(config.path, config.metadata_filename)); - } - // To save metadata as file static bool save_metadata(const CacheCleanerConfig &config, const MetadataType &data) { @@ -84,12 +70,6 @@ struct CacheCleanerUtils { return true; } - // To check version - static bool check_version(const CacheCleanerConfig &config, - const Version &version) { - return is_current_llvm_cache_version(version); - } - // To get cache files name static std::vector get_cache_files( const CacheCleanerConfig &config, @@ -106,6 +86,13 @@ struct CacheCleanerUtils { static void remove_other_files(const CacheCleanerConfig &config) { // Do nothing } + + // To check if a file is cache file + static bool is_valid_cache_file(const CacheCleanerConfig &config, + const std::string &name) { + std::string ext = filename_extension(name); + return ext == "ll" || ext == "bc"; + } }; } // namespace offline_cache @@ -126,20 +113,17 @@ bool LlvmOfflineCacheFileReader::load_meta_data( LlvmOfflineCache &data, const std::string &cache_file_path, bool with_lock) { + using offline_cache::load_metadata_with_checking; + using Error = offline_cache::LoadMetadataError; const auto tcb_path = get_llvm_cache_metadata_file_path(cache_file_path); - { - // No the best way to check for filepath existence, but whatever... 
See - // https://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exists-using-standard-c-c11-14-17-c - std::ifstream fs(tcb_path, std::ios::in | std::ios::binary); - if (!fs.good()) { - TI_DEBUG("LLVM cache {} does not exist", cache_file_path); - return false; - } + + if (!taichi::path_exists(tcb_path)) { + TI_DEBUG("File {} not found", tcb_path); + return false; } if (!with_lock) { - read_from_binary_file(data, tcb_path); - return true; + return Error::kNoError == load_metadata_with_checking(data, tcb_path); } std::string lock_path = @@ -150,8 +134,7 @@ bool LlvmOfflineCacheFileReader::load_meta_data( TI_WARN("Unlock {} failed", lock_path); } }); - read_from_binary_file(data, tcb_path); - return true; + return Error::kNoError == load_metadata_with_checking(data, tcb_path); } TI_WARN("Lock {} failed", lock_path); return false; @@ -389,10 +372,6 @@ void LlvmOfflineCacheFileWriter::mangle_offloaded_task_name( for (auto &offload : compiled_data.tasks) { std::string mangled_name = offline_cache::mangle_name(offload.name, kernel_key); - TI_DEBUG( - "Mangle offloaded-task from internal name '{}' to offline cache " - "key '{}'", - offload.name, mangled_name); auto func = compiled_data.module->getFunction(offload.name); TI_ASSERT(func != nullptr); func->setName(mangled_name); diff --git a/taichi/runtime/llvm/llvm_offline_cache.h b/taichi/runtime/llvm/llvm_offline_cache.h index c4eac6cc4fc078..5142da03bb4a10 100644 --- a/taichi/runtime/llvm/llvm_offline_cache.h +++ b/taichi/runtime/llvm/llvm_offline_cache.h @@ -100,6 +100,7 @@ struct LlvmOfflineCache { std::unordered_map kernels; // key = kernel_name + // NOTE: The "version" must be the first field to be serialized TI_IO_DEF(version, size, fields, kernels); }; diff --git a/taichi/runtime/program_impls/opengl/opengl_program.cpp b/taichi/runtime/program_impls/opengl/opengl_program.cpp index c0da4c403a1065..4df1cad1db3add 100644 --- a/taichi/runtime/program_impls/opengl/opengl_program.cpp +++ 
b/taichi/runtime/program_impls/opengl/opengl_program.cpp @@ -1,5 +1,6 @@ #include "opengl_program.h" +#include "taichi/analysis/offline_cache_util.h" #include "taichi/rhi/opengl/opengl_api.h" #include "taichi/runtime/gfx/aot_module_builder_impl.h" #include "taichi/runtime/gfx/aot_module_loader_impl.h" @@ -7,20 +8,18 @@ namespace taichi { namespace lang { -namespace opengl { +namespace { -FunctionType compile_to_executable(Kernel *kernel, - gfx::GfxRuntime *runtime, - gfx::SNodeTreeManager *snode_tree_mgr) { - auto handle = runtime->register_taichi_kernel( - gfx::run_codegen(kernel, runtime->get_ti_device(), - snode_tree_mgr->get_compiled_structs())); +FunctionType register_params_to_executable( + gfx::GfxRuntime::RegisterParams &¶ms, + gfx::GfxRuntime *runtime) { + auto handle = runtime->register_taichi_kernel(std::move(params)); return [runtime, handle](RuntimeContext &ctx) { runtime->launch_kernel(handle, &ctx); }; } -} // namespace opengl +} // namespace OpenglProgramImpl::OpenglProgramImpl(CompileConfig &config) : ProgramImpl(config) { @@ -28,9 +27,8 @@ OpenglProgramImpl::OpenglProgramImpl(CompileConfig &config) FunctionType OpenglProgramImpl::compile(Kernel *kernel, OffloadedStmt *offloaded) { - spirv::lower(kernel); - return opengl::compile_to_executable(kernel, runtime_.get(), - snode_tree_mgr_.get()); + return register_params_to_executable( + get_cache_manager()->load_or_compile(config, kernel), runtime_.get()); } void OpenglProgramImpl::materialize_runtime(MemoryPool *memory_pool, @@ -87,11 +85,39 @@ DeviceAllocation OpenglProgramImpl::allocate_texture( std::unique_ptr OpenglProgramImpl::make_aot_kernel( Kernel &kernel) { - spirv::lower(&kernel); - std::vector compiled_structs; - gfx::GfxRuntime::RegisterParams kparams = - gfx::run_codegen(&kernel, get_compute_device(), compiled_structs); - return std::make_unique(runtime_.get(), std::move(kparams)); + auto params = get_cache_manager()->load_or_compile(config, &kernel); + return 
std::make_unique(runtime_.get(), std::move(params)); +} + +void OpenglProgramImpl::dump_cache_data_to_disk() { + const auto &mgr = get_cache_manager(); + mgr->clean_offline_cache(offline_cache::string_to_clean_cache_policy( + config->offline_cache_cleaning_policy), + config->offline_cache_max_size_of_files, + config->offline_cache_cleaning_factor); + mgr->dump_with_merging(); +} + +const std::unique_ptr + &OpenglProgramImpl::get_cache_manager() { + if (!cache_manager_) { + TI_ASSERT(runtime_ && snode_tree_mgr_ && device_); + auto target_device = std::make_unique(config->arch); + device_->clone_caps(*target_device); + using Mgr = gfx::CacheManager; + Mgr::Params params; + params.arch = config->arch; + params.mode = + offline_cache::enabled_wip_offline_cache(config->offline_cache) + ? Mgr::MemAndDiskCache + : Mgr::MemCache; + params.cache_path = config->offline_cache_file_path; + params.runtime = runtime_.get(); + params.target_device = std::move(target_device); + params.compiled_structs = &snode_tree_mgr_->get_compiled_structs(); + cache_manager_ = std::make_unique(std::move(params)); + } + return cache_manager_; } } // namespace lang diff --git a/taichi/runtime/program_impls/opengl/opengl_program.h b/taichi/runtime/program_impls/opengl/opengl_program.h index 9d4ccbf9b7405b..7965c9221b22a5 100644 --- a/taichi/runtime/program_impls/opengl/opengl_program.h +++ b/taichi/runtime/program_impls/opengl/opengl_program.h @@ -1,5 +1,6 @@ #pragma once +#include "taichi/cache/gfx/cache_manager.h" #include "taichi/runtime/gfx/runtime.h" #include "taichi/runtime/gfx/snode_tree_manager.h" #include "taichi/program/program_impl.h" @@ -63,11 +64,16 @@ class OpenglProgramImpl : public ProgramImpl { std::unique_ptr make_aot_kernel(Kernel &kernel) override; + void dump_cache_data_to_disk() override; + + const std::unique_ptr &get_cache_manager(); + private: std::shared_ptr device_{nullptr}; std::unique_ptr runtime_{nullptr}; std::unique_ptr snode_tree_mgr_{nullptr}; std::vector 
aot_compiled_snode_structs_; + std::unique_ptr cache_manager_{nullptr}; }; } // namespace lang diff --git a/taichi/runtime/program_impls/vulkan/vulkan_program.cpp b/taichi/runtime/program_impls/vulkan/vulkan_program.cpp index f256c73128b815..9d1ac33268ce73 100644 --- a/taichi/runtime/program_impls/vulkan/vulkan_program.cpp +++ b/taichi/runtime/program_impls/vulkan/vulkan_program.cpp @@ -130,6 +130,11 @@ void VulkanProgramImpl::materialize_runtime(MemoryPool *memory_pool, int32_t patch = std::atoll(config->vk_api_version.c_str() + idot2 + 1); evd_params.api_version = VK_MAKE_API_VERSION(0, major, minor, patch); } + + if (config->debug) { + TI_WARN("Enabling vulkan validation layer in debug mode"); + evd_params.enable_validation_layer = true; + } #if !defined(ANDROID) if (glfw_window) { // then we should be able to create a device with graphics abilities @@ -207,6 +212,12 @@ std::unique_ptr VulkanProgramImpl::make_aot_kernel( std::move(params)); } +void VulkanProgramImpl::enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) { + vulkan_runtime_->enqueue_compute_op_lambda(op, image_refs); +} + void VulkanProgramImpl::dump_cache_data_to_disk() { const auto &mgr = get_cache_manager(); mgr->clean_offline_cache(offline_cache::string_to_clean_cache_policy( diff --git a/taichi/runtime/program_impls/vulkan/vulkan_program.h b/taichi/runtime/program_impls/vulkan/vulkan_program.h index 754983a3c2a9a8..9a8edd8b2232a8 100644 --- a/taichi/runtime/program_impls/vulkan/vulkan_program.h +++ b/taichi/runtime/program_impls/vulkan/vulkan_program.h @@ -93,6 +93,10 @@ class VulkanProgramImpl : public ProgramImpl { std::unique_ptr make_aot_kernel(Kernel &kernel) override; + void enqueue_compute_op_lambda( + std::function op, + const std::vector &image_refs) override; + void dump_cache_data_to_disk() override; const std::unique_ptr &get_cache_manager(); diff --git a/taichi/system/memory_pool.h b/taichi/system/memory_pool.h index c41185f1bbc4a1..25da2e840ddf03 
100644 --- a/taichi/system/memory_pool.h +++ b/taichi/system/memory_pool.h @@ -3,6 +3,7 @@ #include "taichi/system/unified_allocator.h" #define TI_RUNTIME_HOST #include "taichi/runtime/llvm/runtime_module/mem_request.h" +#undef TI_RUNTIME_HOST #include "taichi/rhi/device.h" #include #include diff --git a/taichi/system/timer.cpp b/taichi/system/timer.cpp index 45491f4eea29a9..fb0d61472e2310 100644 --- a/taichi/system/timer.cpp +++ b/taichi/system/timer.cpp @@ -220,7 +220,7 @@ uint64 Time::get_cycles() { #else uint64 Time::get_cycles() { -#if defined(TI_ARCH_x64) +#if defined(TI_ARCH_x64) && !(defined(__arm64__) || defined(__aarch64__)) unsigned int lo, hi; __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); return ((uint64)hi << 32) | lo; diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp index e140a3bf0e6659..70ca1cd551e493 100644 --- a/taichi/transforms/scalarize.cpp +++ b/taichi/transforms/scalarize.cpp @@ -8,10 +8,14 @@ TLANG_NAMESPACE_BEGIN class Scalarize : public IRVisitor { public: + DelayedIRModifier modifier_; + Scalarize(IRNode *node) { allow_undefined_visitor = true; invoke_default_visitor = false; node->accept(this); + + modifier_.modify_ir(); } /* @@ -51,18 +55,75 @@ class Scalarize : public IRVisitor { int num_elements = val_tensor_type->get_num_elements(); for (int i = 0; i < num_elements; i++) { auto const_stmt = std::make_unique( - TypedConstant(stmt->val->ret_type.get_element_type(), i)); + TypedConstant(get_data_type(), i)); auto ptr_offset_stmt = std::make_unique(stmt->dest, const_stmt.get()); auto scalarized_stmt = std::make_unique(ptr_offset_stmt.get(), matrix_init_stmt->values[i]); - stmt->insert_before_me(std::move(const_stmt)); - stmt->insert_before_me(std::move(ptr_offset_stmt)); - stmt->insert_before_me(std::move(scalarized_stmt)); + modifier_.insert_before(stmt, std::move(const_stmt)); + modifier_.insert_before(stmt, std::move(ptr_offset_stmt)); + modifier_.insert_before(stmt, std::move(scalarized_stmt)); + } 
+ modifier_.erase(stmt); + } + } + + /* + + Before: + TensorType<4 x i32> val = LoadStmt(TensorType<4 x i32>* src) + + After: + i32* addr0 = PtrOffsetStmt(TensorType<4 x i32>* src, 0) + i32* addr1 = PtrOffsetStmt(TensorType<4 x i32>* src, 1) + i32* addr2 = PtrOffsetStmt(TensorType<4 x i32>* src, 2) + i32* addr3 = PtrOffsetStmt(TensorType<4 x i32>* src, 3) + + i32 val0 = LoadStmt(addr0) + i32 val1 = LoadStmt(addr1) + i32 val2 = LoadStmt(addr2) + i32 val3 = LoadStmt(addr3) + + tmp = MatrixInitStmt(val0, val1, val2, val3) + + stmt->replace_all_usages_with(tmp) + */ + template + void scalarize_load_stmt(T *stmt) { + auto src_dtype = stmt->src->ret_type.ptr_removed(); + if (src_dtype->template is()) { + // Needs scalarize + auto src_tensor_type = src_dtype->template as(); + + std::vector matrix_init_values; + int num_elements = src_tensor_type->get_num_elements(); + + for (size_t i = 0; i < num_elements; i++) { + auto const_stmt = std::make_unique( + TypedConstant(get_data_type(), i)); + + auto ptr_offset_stmt = + std::make_unique(stmt->src, const_stmt.get()); + auto scalarized_stmt = std::make_unique(ptr_offset_stmt.get()); + + matrix_init_values.push_back(scalarized_stmt.get()); + + modifier_.insert_before(stmt, std::move(const_stmt)); + modifier_.insert_before(stmt, std::move(ptr_offset_stmt)); + modifier_.insert_before(stmt, std::move(scalarized_stmt)); } - stmt->parent->erase(stmt); + + auto matrix_init_stmt = + std::make_unique(matrix_init_values); + + matrix_init_stmt->ret_type = src_dtype; + + stmt->replace_usages_with(matrix_init_stmt.get()); + modifier_.insert_before(stmt, std::move(matrix_init_stmt)); + + modifier_.erase(stmt); } } @@ -107,6 +168,14 @@ class Scalarize : public IRVisitor { void visit(LocalStoreStmt *stmt) override { scalarize_store_stmt(stmt); } + + void visit(GlobalLoadStmt *stmt) override { + scalarize_load_stmt(stmt); + } + + void visit(LocalLoadStmt *stmt) override { + scalarize_load_stmt(stmt); + } }; namespace irpass { diff --git 
a/taichi/ui/backends/vulkan/canvas.cpp b/taichi/ui/backends/vulkan/canvas.cpp index b2c7a2af52c7a7..55aa7900549f7c 100644 --- a/taichi/ui/backends/vulkan/canvas.cpp +++ b/taichi/ui/backends/vulkan/canvas.cpp @@ -18,6 +18,10 @@ void Canvas::set_image(const SetImageInfo &info) { renderer_->set_image(info); } +void Canvas::set_image(Texture *tex) { + renderer_->set_image(tex); +} + void Canvas::triangles(const TrianglesInfo &info) { renderer_->triangles(info); } diff --git a/taichi/ui/backends/vulkan/canvas.h b/taichi/ui/backends/vulkan/canvas.h index 287c1953621434..e4e6c12da8e19a 100644 --- a/taichi/ui/backends/vulkan/canvas.h +++ b/taichi/ui/backends/vulkan/canvas.h @@ -43,6 +43,8 @@ class TI_DLL_EXPORT Canvas final : public CanvasBase { virtual void set_image(const SetImageInfo &info) override; + virtual void set_image(taichi::lang::Texture *tex) override; + virtual void triangles(const TrianglesInfo &info) override; virtual void circles(const CirclesInfo &info) override; diff --git a/taichi/ui/backends/vulkan/renderable.cpp b/taichi/ui/backends/vulkan/renderable.cpp index 81e92b2fc8d659..01050ec168c550 100644 --- a/taichi/ui/backends/vulkan/renderable.cpp +++ b/taichi/ui/backends/vulkan/renderable.cpp @@ -39,12 +39,39 @@ void Renderable::init_buffers() { create_bindings(); } +void copy_helper(Program *prog, + DevicePtr dst, + DevicePtr src, + DevicePtr staging, + size_t size) { + if (prog && dst.device == src.device && + dst.device == prog->get_graphics_device()) { + prog->enqueue_compute_op_lambda( + [=](Device *device, CommandList *cmdlist) { + cmdlist->buffer_barrier(src); + cmdlist->buffer_copy(dst, src, size); + cmdlist->buffer_barrier(dst); + }, + {}); + } else { + Device::MemcpyCapability memcpy_cap = + Device::check_memcpy_capability(dst, src, size); + if (memcpy_cap == Device::MemcpyCapability::Direct) { + Device::memcpy_direct(dst, src, size); + } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { + Device::memcpy_via_staging(dst, 
staging, src, size); + } else { + TI_NOT_IMPLEMENTED; + } + } +} + void Renderable::update_data(const RenderableInfo &info) { TI_ASSERT(info.vbo_attrs == config_.vbo_attrs); // We might not have a current program if GGUI is used in external apps to // load AOT modules Program *prog = app_context_->prog(); - if (prog) { + if (prog && prog->get_graphics_device() != &app_context_->device()) { prog->flush(); } @@ -114,18 +141,8 @@ void Renderable::update_data(const RenderableInfo &info) { } const uint64_t vbo_size = config_.vbo_size() * num_vertices; - - Device::MemcpyCapability memcpy_cap = Device::check_memcpy_capability( - vertex_buffer_.get_ptr(), vbo_dev_ptr, vbo_size); - if (memcpy_cap == Device::MemcpyCapability::Direct) { - Device::memcpy_direct(vertex_buffer_.get_ptr(), vbo_dev_ptr, vbo_size); - } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { - Device::memcpy_via_staging(vertex_buffer_.get_ptr(), - staging_vertex_buffer_.get_ptr(), vbo_dev_ptr, - vbo_size); - } else { - TI_NOT_IMPLEMENTED; - } + copy_helper(prog, vertex_buffer_.get_ptr(0), vbo_dev_ptr, + staging_vertex_buffer_.get_ptr(), vbo_size); if (info.indices.valid) { indexed_ = true; @@ -134,15 +151,8 @@ void Renderable::update_data(const RenderableInfo &info) { ibo_dev_ptr = get_device_ptr(prog, info.indices.snode); } uint64_t ibo_size = num_indices * sizeof(int); - if (memcpy_cap == Device::MemcpyCapability::Direct) { - Device::memcpy_direct(index_buffer_.get_ptr(), ibo_dev_ptr, ibo_size); - } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { - Device::memcpy_via_staging(index_buffer_.get_ptr(), - staging_index_buffer_.get_ptr(), ibo_dev_ptr, - ibo_size); - } else { - TI_NOT_IMPLEMENTED; - } + copy_helper(prog, index_buffer_.get_ptr(), ibo_dev_ptr, + staging_index_buffer_.get_ptr(), ibo_size); } } diff --git a/taichi/ui/backends/vulkan/renderables/mesh.cpp b/taichi/ui/backends/vulkan/renderables/mesh.cpp index e82b62aefc199c..66423e09efa903 100644 
--- a/taichi/ui/backends/vulkan/renderables/mesh.cpp +++ b/taichi/ui/backends/vulkan/renderables/mesh.cpp @@ -88,18 +88,22 @@ void Mesh::update_data(const MeshInfo &info, const Scene &scene) { attr_dev_ptr = get_device_ptr(prog, info.mesh_attribute_info.mesh_attribute.snode); } + // TODO : At present, we donnot support copying from cuda device memory to a + // host-visible vulkan device memory directly on Windows platform, which is + // not a ideal way for handling storage buffer. So here we set the + // `mesh_ssbo` vulkan buffer as device-local memory using staging buffer + // filling data. However, that is not what is used to do for a storage + // buffer (usually set as host-visible memory), we should f`ix this on + // Windows in future. Device::MemcpyCapability memcpy_cap = Device::check_memcpy_capability( mesh_storage_buffer_.get_ptr(), attr_dev_ptr, mesh_ssbo_size_); if (memcpy_cap == Device::MemcpyCapability::Direct) { Device::memcpy_direct(mesh_storage_buffer_.get_ptr(), attr_dev_ptr, mesh_ssbo_size_); } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { - void *ssbo_mapped = app_context_->device().map(mesh_storage_buffer_); - DeviceAllocation attr_buffer(attr_dev_ptr); - void *attr_mapped = attr_dev_ptr.device->map(attr_buffer); - memcpy(ssbo_mapped, attr_mapped, mesh_ssbo_size_); - app_context_->device().unmap(mesh_storage_buffer_); - attr_dev_ptr.device->unmap(attr_buffer); + Device::memcpy_via_staging(mesh_storage_buffer_.get_ptr(), + staging_vertex_buffer_.get_ptr(), attr_dev_ptr, + mesh_ssbo_size_); } else { TI_NOT_IMPLEMENTED; } @@ -163,7 +167,8 @@ void Mesh::create_mesh_storage_buffers() { if (mesh_ssbo_size_ == 0) { mesh_ssbo_size_ = 4 * 4 * sizeof(float); } - Device::AllocParams sb_params{mesh_ssbo_size_, true, false, true, + Device::AllocParams sb_params{mesh_ssbo_size_, false, false, + app_context_->requires_export_sharing(), AllocUsage::Storage}; mesh_storage_buffer_ = app_context_->device().allocate_memory(sb_params); } 
diff --git a/taichi/ui/backends/vulkan/renderables/set_image.cpp b/taichi/ui/backends/vulkan/renderables/set_image.cpp index e0d3e331ccd239..b56514a687e86b 100644 --- a/taichi/ui/backends/vulkan/renderables/set_image.cpp +++ b/taichi/ui/backends/vulkan/renderables/set_image.cpp @@ -1,6 +1,7 @@ #include "set_image.h" #include "taichi/program/program.h" +#include "taichi/program/texture.h" #include "taichi/ui/utils/utils.h" using taichi::lang::Program; @@ -20,8 +21,8 @@ int SetImage::get_correct_dimension(int dimension) { } } -void SetImage::update_ubo(float x_factor, float y_factor) { - UniformBufferObject ubo = {x_factor, y_factor}; +void SetImage::update_ubo(float x_factor, float y_factor, bool transpose) { + UniformBufferObject ubo = {x_factor, y_factor, int(transpose)}; void *mapped = app_context_->device().map(uniform_buffer_); memcpy(mapped, &ubo, sizeof(ubo)); app_context_->device().unmap(uniform_buffer_); @@ -31,10 +32,7 @@ void SetImage::update_data(const SetImageInfo &info) { // We might not have a current program if GGUI is used in external apps to // load AOT modules Program *prog = app_context_->prog(); - StreamSemaphore data_ready_sema{nullptr}; - if (prog) { - data_ready_sema = prog->flush(); - } + StreamSemaphore sema = nullptr; const FieldInfo &img = info.img; @@ -57,19 +55,22 @@ void SetImage::update_data(const SetImageInfo &info) { int new_width = get_correct_dimension(img.shape[0]); int new_height = get_correct_dimension(img.shape[1]); - if (new_width != width || new_height != height) { + BufferFormat fmt = BufferFormat::rgba8; + if (texture_dtype_ == taichi::lang::PrimitiveType::f32) { + fmt = BufferFormat::rgba32f; + } + + if (new_width != width || new_height != height || fmt != format_) { destroy_texture(); free_buffers(); - init_set_image(app_context_, new_width, new_height); + init_set_image(app_context_, new_width, new_height, fmt); } - update_ubo(img.shape[0] / (float)new_width, img.shape[1] / (float)new_height); + update_ubo(img.shape[0] 
/ (float)new_width, img.shape[1] / (float)new_height, + true); int pixels = width * height; - app_context_->device().image_transition(texture_, ImageLayout::undefined, - ImageLayout::transfer_dst); - uint64_t img_size = pixels * data_type_size(texture_dtype_) * 4; // If there is no current program, VBO information should be provided directly @@ -77,12 +78,22 @@ void SetImage::update_data(const SetImageInfo &info) { DevicePtr img_dev_ptr = info.img.dev_alloc.get_ptr(); if (prog) { img_dev_ptr = get_device_ptr(prog, img.snode); + if (img_dev_ptr.device != &app_context_->device()) { + sema = prog->flush(); + } } + bool use_enqueued_op = + prog && (img_dev_ptr.device == &app_context_->device()); Device::MemcpyCapability memcpy_cap = Device::check_memcpy_capability( gpu_staging_buffer_.get_ptr(), img_dev_ptr, img_size); if (memcpy_cap == Device::MemcpyCapability::Direct) { - Device::memcpy_direct(gpu_staging_buffer_.get_ptr(), img_dev_ptr, img_size); + // If it's the same device, we do not use the staging buffer and directly + // copy from the src ptr to the image in the `copy_op` + if (!use_enqueued_op) { + Device::memcpy_direct(gpu_staging_buffer_.get_ptr(), img_dev_ptr, + img_size); + } } else if (memcpy_cap == Device::MemcpyCapability::RequiresStagingBuffer) { Device::memcpy_via_staging(gpu_staging_buffer_.get_ptr(), cpu_staging_buffer_.get_ptr(), img_dev_ptr, @@ -96,29 +107,95 @@ void SetImage::update_data(const SetImageInfo &info) { copy_params.image_extent.x = height; copy_params.image_extent.y = width; - auto stream = app_context_->device().get_graphics_stream(); - auto cmd_list = stream->new_command_list(); - cmd_list->image_transition(texture_, ImageLayout::undefined, - ImageLayout::transfer_dst); - cmd_list->buffer_to_image(texture_, gpu_staging_buffer_.get_ptr(0), - ImageLayout::transfer_dst, copy_params); - - cmd_list->image_transition(texture_, ImageLayout::transfer_dst, - ImageLayout::shader_read); - if (data_ready_sema) { - stream->submit(cmd_list.get(), 
{data_ready_sema}); + DevicePtr src_ptr = + use_enqueued_op ? img_dev_ptr : gpu_staging_buffer_.get_ptr(0); + + auto copy_op = [texture = this->texture_, src_ptr, copy_params]( + Device *device, CommandList *cmdlist) { + cmdlist->image_transition(texture, ImageLayout::undefined, + ImageLayout::transfer_dst); + cmdlist->buffer_barrier(src_ptr); + cmdlist->buffer_to_image(texture, src_ptr, ImageLayout::transfer_dst, + copy_params); + cmdlist->image_transition(texture, ImageLayout::transfer_dst, + ImageLayout::shader_read); + }; + + if (use_enqueued_op) { + prog->enqueue_compute_op_lambda(copy_op, {}); + } else { + auto stream = app_context_->device().get_graphics_stream(); + auto cmd_list = stream->new_command_list(); + copy_op(&app_context_->device(), cmd_list.get()); + if (sema) { + stream->submit(cmd_list.get(), {sema}); + } else { + stream->submit(cmd_list.get()); + } + } +} + +void SetImage::update_data(Texture *tex) { + Program *prog = app_context_->prog(); + + auto shape = tex->get_size(); + auto fmt = tex->get_buffer_format(); + + TI_ASSERT_INFO(shape[2] == 1, + "Must be a 2D image! Received image shape: {}x{}x{}", shape[0], + shape[1], shape[2]); + + // Reminder: y/x is flipped in Taichi. 
I would like to use the correct + // orientation, but we have existing code already using the previous + // convention + if (shape[1] != width || shape[0] != height || fmt != format_) { + destroy_texture(); + free_buffers(); + init_set_image(app_context_, shape[1], shape[0], fmt); + } + + update_ubo(1.0f, 1.0f, false); + + ImageCopyParams copy_params; + copy_params.width = shape[0]; + copy_params.height = shape[1]; + copy_params.depth = shape[2]; + + DeviceAllocation src_alloc = tex->get_device_allocation(); + auto copy_op = [texture = this->texture_, src_alloc, copy_params]( + Device *device, CommandList *cmdlist) { + cmdlist->image_transition(texture, ImageLayout::undefined, + ImageLayout::transfer_dst); + cmdlist->copy_image(texture, src_alloc, ImageLayout::transfer_dst, + ImageLayout::transfer_src, copy_params); + cmdlist->image_transition(texture, ImageLayout::transfer_dst, + ImageLayout::shader_read); + }; + + // In the current state if we called this direct image update data method, we + // guarantee to have a program. + // FIXME: However, if we don't have a Program, where does the layout come + // from?
+ if (prog) { + prog->enqueue_compute_op_lambda( + copy_op, {ComputeOpImageRef{src_alloc, ImageLayout::transfer_src, + ImageLayout::transfer_src}}); } else { + auto stream = app_context_->device().get_graphics_stream(); + auto cmd_list = stream->new_command_list(); + copy_op(&app_context_->device(), cmd_list.get()); stream->submit(cmd_list.get()); } } SetImage::SetImage(AppContext *app_context, VertexAttributes vbo_attrs) { - init_set_image(app_context, 1, 1); + init_set_image(app_context, 1, 1, BufferFormat::rgba8); } void SetImage::init_set_image(AppContext *app_context, int img_width, - int img_height) { + int img_height, + taichi::lang::BufferFormat format) { RenderableConfig config = { 6, 6, @@ -138,8 +215,9 @@ void SetImage::init_set_image(AppContext *app_context, Renderable::init(config, app_context); - width = img_width; - height = img_height; + this->width = img_width; + this->height = img_height; + format_ = format; create_texture(); @@ -154,10 +232,7 @@ void SetImage::create_texture() { ImageParams params; params.dimension = ImageDimension::d2D; - params.format = BufferFormat::rgba8; - if (texture_dtype_ == taichi::lang::PrimitiveType::f32) { - params.format = BufferFormat::rgba32f; - } + params.format = format_; params.initial_layout = ImageLayout::shader_read; // these are flipped because taichi is y-major and vulkan is x-major params.x = height; diff --git a/taichi/ui/backends/vulkan/renderables/set_image.h b/taichi/ui/backends/vulkan/renderables/set_image.h index 6e2323fcf9f64e..8698726d0433f1 100644 --- a/taichi/ui/backends/vulkan/renderables/set_image.h +++ b/taichi/ui/backends/vulkan/renderables/set_image.h @@ -35,12 +35,15 @@ class SetImage final : public Renderable { // the actual image is only a corner of the whole image float x_factor{1.0}; float y_factor{1.0}; + int transpose{0}; }; SetImage(AppContext *app_context, VertexAttributes vbo_attrs); void update_data(const SetImageInfo &info); + void update_data(taichi::lang::Texture *tex); + 
virtual void cleanup() override; private: @@ -50,8 +53,13 @@ class SetImage final : public Renderable { taichi::lang::DataType texture_dtype_{taichi::lang::PrimitiveType::u8}; taichi::lang::DeviceAllocation texture_; + taichi::lang::BufferFormat format_; + private: - void init_set_image(AppContext *app_context, int img_width, int img_height); + void init_set_image(AppContext *app_context, + int img_width, + int img_height, + taichi::lang::BufferFormat format); virtual void create_bindings() override; @@ -64,7 +72,7 @@ class SetImage final : public Renderable { int get_correct_dimension(int dimension); - void update_ubo(float x_factor, float y_factor); + void update_ubo(float x_factor, float y_factor, bool transpose); }; } // namespace vulkan diff --git a/taichi/ui/backends/vulkan/renderer.cpp b/taichi/ui/backends/vulkan/renderer.cpp index e01803940c1091..74c52ceabb45d2 100644 --- a/taichi/ui/backends/vulkan/renderer.cpp +++ b/taichi/ui/backends/vulkan/renderer.cpp @@ -50,6 +50,12 @@ void Renderer::set_image(const SetImageInfo &info) { next_renderable_ += 1; } +void Renderer::set_image(Texture *tex) { + SetImage *s = get_renderable_of_type(VboHelpers::all()); + s->update_data(tex); + next_renderable_ += 1; +} + void Renderer::triangles(const TrianglesInfo &info) { Triangles *triangles = get_renderable_of_type(info.renderable_info.vbo_attrs); diff --git a/taichi/ui/backends/vulkan/renderer.h b/taichi/ui/backends/vulkan/renderer.h index e7e0084e673468..dd9bdc664a83b8 100644 --- a/taichi/ui/backends/vulkan/renderer.h +++ b/taichi/ui/backends/vulkan/renderer.h @@ -54,6 +54,8 @@ class TI_DLL_EXPORT Renderer { void set_image(const SetImageInfo &info); + void set_image(taichi::lang::Texture *tex); + void triangles(const TrianglesInfo &info); void circles(const CirclesInfo &info); diff --git a/taichi/ui/common/canvas_base.h b/taichi/ui/common/canvas_base.h index fbf29c4fe1f28b..98af1523b177cf 100644 --- a/taichi/ui/common/canvas_base.h +++ b/taichi/ui/common/canvas_base.h 
@@ -5,6 +5,14 @@ #include "taichi/ui/common/renderable_info.h" #include "taichi/ui/utils/utils.h" +namespace taichi { +namespace lang { + +class Texture; + +} +} // namespace taichi + TI_UI_NAMESPACE_BEGIN struct SetImageInfo { @@ -32,6 +40,7 @@ class CanvasBase { public: virtual void set_background_color(const glm::vec3 &color) = 0; virtual void set_image(const SetImageInfo &info) = 0; + virtual void set_image(taichi::lang::Texture *tex) = 0; virtual void triangles(const TrianglesInfo &info) = 0; virtual void circles(const CirclesInfo &info) = 0; virtual void lines(const LinesInfo &info) = 0; diff --git a/taichi/util/io.h b/taichi/util/io.h index 30c41f91516929..6409dbf2cd2bf3 100644 --- a/taichi/util/io.h +++ b/taichi/util/io.h @@ -14,6 +14,11 @@ #if defined(TI_PLATFORM_WINDOWS) #include +#else // POSIX +#include +#include +#include +#include #endif TI_NAMESPACE_BEGIN @@ -51,6 +56,47 @@ inline bool remove(const std::string &path) { return std::remove(path.c_str()) == 0; } +template // void(const std::string &name, bool is_dir) +inline bool traverse_directory(const std::string &dir, Visitor v) { +#if defined(TI_PLATFORM_WINDOWS) + namespace fs = std::filesystem; + std::error_code ec{}; + auto iter = fs::directory_iterator(dir, ec); + if (ec) { + return false; + } + for (auto &f : iter) { + v(f.path().filename().string(), f.is_directory()); + } + return true; +#else // POSIX + struct dirent *f = nullptr; + DIR *directory = ::opendir(dir.c_str()); + if (!directory) { + return false; + } + while ((f = ::readdir(directory))) { + auto fullpath = join_path(dir, f->d_name); + struct stat stat_buf; + auto ret = ::stat(fullpath.c_str(), &stat_buf); + TI_ASSERT(ret == 0); + v(f->d_name, S_ISDIR(stat_buf.st_mode)); + } + auto ret = ::closedir(directory); + TI_ASSERT(ret == 0); + return true; +#endif +} + +inline std::string filename_extension(const std::string &filename) { + std::string postfix; + auto pos = filename.find_last_of('.'); + if (pos != std::string::npos) { + 
postfix = filename.substr(pos + 1); + } + return postfix; +} + template void write_to_disk(const T &dat, std::string fn) { FILE *f = fopen(fn.c_str(), "wb"); diff --git a/taichi/util/offline_cache.h b/taichi/util/offline_cache.h index 961c17b75392f3..ff760de1996a95 100644 --- a/taichi/util/offline_cache.h +++ b/taichi/util/offline_cache.h @@ -9,6 +9,7 @@ #include "taichi/common/core.h" #include "taichi/common/cleanup.h" +#include "taichi/common/version.h" #include "taichi/util/io.h" #include "taichi/util/lock.h" @@ -60,9 +61,46 @@ struct Metadata { std::size_t size{0}; // byte std::unordered_map kernels; + // NOTE: The "version" must be the first field to be serialized TI_IO_DEF(version, size, kernels); }; +enum class LoadMetadataError { + kNoError, + kCorrupted, + kFileNotFound, + kVersionNotMatched, +}; + +template +inline LoadMetadataError load_metadata_with_checking( + MetadataType &result, + const std::string &filepath) { + if (!taichi::path_exists(filepath)) { + TI_DEBUG("Offline cache metadata file {} not found", filepath); + return LoadMetadataError::kFileNotFound; + } + + using VerType = std::remove_reference_t; + static_assert(std::is_same_v); + const std::vector bytes = read_data_from_file(filepath); + + VerType ver{}; + if (!read_from_binary(ver, bytes.data(), bytes.size(), false)) { + return LoadMetadataError::kCorrupted; + } + if (ver[0] != TI_VERSION_MAJOR || ver[1] != TI_VERSION_MINOR || + ver[2] != TI_VERSION_PATCH) { + TI_DEBUG("The offline cache metadata file {} is old (version={}.{}.{})", + filepath, ver[0], ver[1], ver[2]); + return LoadMetadataError::kVersionNotMatched; + } + + return !read_from_binary(result, bytes.data(), bytes.size()) + ? 
LoadMetadataError::kCorrupted + : LoadMetadataError::kNoError; +} + struct CacheCleanerConfig { std::string path; CleanCachePolicy policy; @@ -77,12 +115,6 @@ template struct CacheCleanerUtils { using KernelMetaData = typename MetadataType::KernelMetadata; - // To load metadata from file - static bool load_metadata(const CacheCleanerConfig &config, - MetadataType &result) { - TI_NOT_IMPLEMENTED; - } - // To save metadata as file static bool save_metadata(const CacheCleanerConfig &config, const MetadataType &data) { @@ -94,12 +126,6 @@ struct CacheCleanerUtils { TI_NOT_IMPLEMENTED; } - // To check version - static bool check_version(const CacheCleanerConfig &config, - const Version &version) { - TI_NOT_IMPLEMENTED; - } - // To get cache files name static std::vector get_cache_files( const CacheCleanerConfig &config, @@ -111,6 +137,12 @@ struct CacheCleanerUtils { static void remove_other_files(const CacheCleanerConfig &config) { TI_NOT_IMPLEMENTED; } + + // To check if a file is cache file + static bool is_valid_cache_file(const CacheCleanerConfig &config, + const std::string &name) { + TI_NOT_IMPLEMENTED; + } }; template @@ -158,19 +190,25 @@ class CacheCleaner { }); TI_DEBUG("Start cleaning cache"); - if (!Utils::load_metadata(config, cache_data)) { + using Error = LoadMetadataError; + Error error = load_metadata_with_checking(cache_data, metadata_file); + if (error == Error::kFileNotFound) { return; - } - - if ((policy & CleanOldVersion) && - !Utils::check_version(config, cache_data.version)) { - if (taichi::remove(metadata_file)) { - taichi::remove(debugging_metadata_file); - Utils::remove_other_files(config); - for (const auto &[k, v] : cache_data.kernels) { - for (const auto &f : Utils::get_cache_files(config, v)) { - taichi::remove(taichi::join_path(path, f)); - } + } else if (error == Error::kCorrupted || + error == Error::kVersionNotMatched) { + if (policy & + CleanOldVersion) { // Remove cache files and metadata files + TI_DEBUG("Removing all cache 
files"); + if (taichi::remove(metadata_file)) { + taichi::remove(debugging_metadata_file); + Utils::remove_other_files(config); + bool success = taichi::traverse_directory( + config.path, [&config](const std::string &name, bool is_dir) { + if (!is_dir && Utils::is_valid_cache_file(config, name)) { + taichi::remove(taichi::join_path(config.path, name)); + } + }); + TI_ASSERT(success); } } return; diff --git a/tests/cpp/offline_cache/load_metadata_test.cpp b/tests/cpp/offline_cache/load_metadata_test.cpp new file mode 100644 index 00000000000000..9db7d7de92d9a7 --- /dev/null +++ b/tests/cpp/offline_cache/load_metadata_test.cpp @@ -0,0 +1,112 @@ +#include "gtest/gtest.h" +#include "taichi/common/version.h" +#include "taichi/util/offline_cache.h" + +#ifdef TI_WITH_LLVM +#include "taichi/runtime/llvm/llvm_offline_cache.h" +#endif // TI_WITH_LLVM + +namespace taichi { +namespace lang { + +namespace { + +namespace oc = offline_cache; + +inline void gen_old_version(oc::Version &ver) { + auto &[major, minor, patch] = ver; + major = std::max(TI_VERSION_MAJOR - 1, 0); + minor = std::max(TI_VERSION_MINOR - 1, 0); + patch = std::max(TI_VERSION_PATCH - 1, 0); +} + +template +MetadataType gen_metadata(const oc::Version &ver) { + MetadataType result; + result.size = 1024; + std::copy(std::begin(ver), std::end(ver), std::begin(result.version)); + result.kernels["1"] = {}; + result.kernels["2"] = {}; + return result; +} + +template +MetadataType gen_old_metadata() { + oc::Version old_ver{}; + gen_old_version(old_ver); + return gen_metadata(old_ver); +} + +template +MetadataType gen_correct_metadata() { + oc::Version ver{TI_VERSION_MAJOR, TI_VERSION_MINOR, TI_VERSION_PATCH}; + return gen_metadata(ver); +} + +template +void load_metadata_test() { + std::string fake_file = fmt::format("{}.tcb", std::tmpnam(nullptr)); + std::string old_file = fmt::format("{}.tcb", std::tmpnam(nullptr)); + std::string corrupted_file = fmt::format("{}.tcb", std::tmpnam(nullptr)); + std::string true_file = 
fmt::format("{}.tcb", std::tmpnam(nullptr)); + + // Generate metadata & Save as file + write_to_binary_file(gen_correct_metadata(), true_file); + // Generate old metadata & Save as file + write_to_binary_file(gen_old_metadata(), old_file); + // Generate corrupted metadata file + write_to_binary_file(gen_correct_metadata(), corrupted_file); + std::ofstream(corrupted_file, std::ios::app | std::ios::binary) + << "I-AM-BAD-BYTES" << std::flush; + + using Error = oc::LoadMetadataError; + Error error = Error::kNoError; + + // Load a non-existing metadata file + { + MetadataType data; + error = oc::load_metadata_with_checking(data, fake_file); + EXPECT_EQ(error, Error::kFileNotFound); + } + // Load a old metadata file + { + MetadataType data; + error = oc::load_metadata_with_checking(data, old_file); + EXPECT_EQ(error, Error::kVersionNotMatched); + } + // Load a corrupted metadata file + { + MetadataType data; + error = oc::load_metadata_with_checking(data, corrupted_file); + EXPECT_EQ(error, Error::kCorrupted); + } + // Load a correct metadata file + { + MetadataType data; + error = oc::load_metadata_with_checking(data, true_file); + auto [major, minor, patch] = data.version; + EXPECT_EQ(error, Error::kNoError); + EXPECT_EQ(major, TI_VERSION_MAJOR); + EXPECT_EQ(minor, TI_VERSION_MINOR); + EXPECT_EQ(patch, TI_VERSION_PATCH); + EXPECT_EQ(data.size, 1024); + EXPECT_TRUE(data.kernels.count("1")); + EXPECT_TRUE(data.kernels.count("2")); + } + + taichi::remove(old_file); + taichi::remove(corrupted_file); + taichi::remove(true_file); +} + +} // namespace + +TEST(OfflineCache, LoadMetadata) { +#ifdef TI_WITH_LLVM + load_metadata_test(); +#endif // TI_WITH_LLVM + load_metadata_test(); +} + +} // namespace lang +} // namespace taichi diff --git a/tests/cpp/transforms/scalarize_test.cpp b/tests/cpp/transforms/scalarize_test.cpp index 749fb055e3d988..ab373492f6c0d1 100644 --- a/tests/cpp/transforms/scalarize_test.cpp +++ b/tests/cpp/transforms/scalarize_test.cpp @@ -9,7 +9,7 @@ 
namespace lang { // Basic tests within a basic block template -void test_scalarize() { +void test_store_scalarize() { TestProgram test_prog; test_prog.setup(); @@ -76,9 +76,69 @@ void test_scalarize() { EXPECT_EQ(block->statements[16]->is(), true); } +template +void test_load_scalarize() { + TestProgram test_prog; + test_prog.setup(); + + auto block = std::make_unique(); + + auto func = []() {}; + auto kernel = + std::make_unique(*test_prog.prog(), func, "fake_kernel"); + block->kernel = kernel.get(); + + auto &type_factory = TypeFactory::get_instance(); + + /* + TensorType<4 x i32>* %1 = ExternalPtrStmt() + TensorType<4 x i32> %2 = LoadStmt(%1) + */ + Type *tensor_type = type_factory.get_tensor_type( + {2, 2}, type_factory.get_primitive_type(PrimitiveTypeID::i32)); + auto argload_stmt = block->push_back(0 /*arg_id*/, tensor_type); + + std::vector indices = {}; + Stmt *src_stmt = block->push_back( + argload_stmt, indices); // fake ExternalPtrStmt + src_stmt->ret_type = type_factory.get_pointer_type(tensor_type); + + block->push_back(src_stmt); + + irpass::scalarize(block.get()); + + EXPECT_EQ(block->size(), 1 /*argload*/ + 1 /*external_ptr*/ + 4 /*const*/ + + 4 /*ptroffset*/ + 4 /*load*/ + + 1 /*matrix_init*/); + + // Check for scalarized statements + EXPECT_EQ(block->statements[2]->is(), true); + EXPECT_EQ(block->statements[3]->is(), true); + EXPECT_EQ(block->statements[4]->is(), true); + + EXPECT_EQ(block->statements[5]->is(), true); + EXPECT_EQ(block->statements[6]->is(), true); + EXPECT_EQ(block->statements[7]->is(), true); + + EXPECT_EQ(block->statements[8]->is(), true); + EXPECT_EQ(block->statements[9]->is(), true); + EXPECT_EQ(block->statements[10]->is(), true); + + EXPECT_EQ(block->statements[11]->is(), true); + EXPECT_EQ(block->statements[12]->is(), true); + EXPECT_EQ(block->statements[13]->is(), true); + + EXPECT_EQ(block->statements[14]->is(), true); +} + TEST(Scalarize, ScalarizeStore) { - test_scalarize(); - test_scalarize(); + test_store_scalarize(); 
+ test_store_scalarize(); +} + +TEST(Scalarize, ScalarizeLoad) { + test_load_scalarize(); + test_load_scalarize(); } } // namespace lang diff --git a/tests/python/test_ggui.py b/tests/python/test_ggui.py index dca8494990a39f..4550dcccf1090f 100644 --- a/tests/python/test_ggui.py +++ b/tests/python/test_ggui.py @@ -301,7 +301,7 @@ def render(): render() - verify_image(window.get_image_buffer_as_numpy(), 'test_set_image') + verify_image(window.get_image_buffer_as_numpy(), 'test_set_image', 0.3) window.destroy() @@ -390,7 +390,7 @@ def render(): @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available") -@test_utils.test(arch=supported_archs) +@test_utils.test(arch=supported_archs, exclude=[(ti.vulkan, "Darwin")]) def test_fetching_depth_attachment(): window = ti.ui.Window("test", (512, 512), vsync=True, show_window=False) canvas = window.get_canvas() @@ -419,7 +419,7 @@ def render(): @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available") -@test_utils.test(arch=supported_archs) +@test_utils.test(arch=supported_archs, exclude=[(ti.vulkan, "Darwin")]) def test_draw_lines(): N = 10 particles_pos = ti.Vector.field(3, dtype=ti.f32, shape=N) @@ -464,7 +464,7 @@ def render(): @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available") -@test_utils.test(arch=supported_archs) +@test_utils.test(arch=supported_archs, exclude=[(ti.vulkan, "Darwin")]) def test_draw_part_of_particles(): N = 10 particles_pos = ti.Vector.field(3, dtype=ti.f32, shape=N) @@ -598,7 +598,7 @@ def render(): @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available") -@test_utils.test(arch=supported_archs) +@test_utils.test(arch=supported_archs, exclude=[(ti.vulkan, "Darwin")]) def test_draw_part_of_lines(): N = 10 particles_pos = ti.Vector.field(3, dtype=ti.f32, shape=N) @@ -753,10 +753,6 @@ def render(): transforms=instances_transforms) canvas.scene(scene) - if (platform.system() == 'Windows'): - # FIXME:Fix the bug that 
drawing mesh instance report bugs on Windows - return - for i in range(30): update_transform(30) render() @@ -868,10 +864,6 @@ def render(): instance_offset=2) canvas.scene(scene) - if (platform.system() == 'Windows'): - # FIXME:Fix the bug that drawing mesh instance report bugs on Windows - return - for _ in range(RENDER_REPEAT): render() window.get_image_buffer_as_numpy() diff --git a/tests/python/test_graph.py b/tests/python/test_graph.py index 3ca1e56f4f1138..92787c6354e083 100644 --- a/tests/python/test_graph.py +++ b/tests/python/test_graph.py @@ -306,7 +306,7 @@ def foo(a: dt, b: ti.types.ndarray(dtype=dt, field_dim=1)): @pytest.mark.parametrize('dt', [ti.i32, ti.i64, ti.u32, ti.u64]) -@test_utils.test(arch=supported_archs_cgraph) +@test_utils.test(arch=supported_archs_cgraph, exclude=[(ti.vulkan, "Darwin")]) def test_arg_int(dt): @ti.kernel def foo(a: dt, b: ti.types.ndarray(dtype=dt, field_dim=1)): diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py index 1ecd1b65911170..715e59c9210adc 100644 --- a/tests/python/test_matrix.py +++ b/tests/python/test_matrix.py @@ -115,7 +115,7 @@ def func(t: ti.i32): m += ti.Matrix([[3, 4], [5, t]]) print(m @ v) print(r.x, r.y, r.z, r.w) - s = w.transpose() @ m + s = w @ m print(s) print(m) @@ -701,6 +701,24 @@ def bar(): bar() +@test_utils.test(arch=get_host_arch_list(), debug=True) +def test_matrix_vector_multiplication(): + mat = ti.math.mat3(1) + vec = ti.math.vec3(3) + r = mat @ vec + for i in range(3): + assert r[i] == 9 + + @ti.kernel + def foo(): + mat = ti.math.mat3(1) + vec = ti.math.vec3(3) + r = mat @ vec + assert r[0] == r[1] == r[2] == 9 + + foo() + + @test_utils.test(arch=[ti.cuda, ti.cpu], real_matrix=True) def test_local_matrix_read(): @@ -788,3 +806,52 @@ def bar(): with pytest.raises(TaichiCompilationError, match=r'Expected 1 indices, but got 2'): bar() + + +@test_utils.test() +def test_vector_vector_t(): + @ti.kernel + def foo() -> ti.types.matrix(2, 2, ti.f32): + a = ti.Vector([1.0, 
2.0]) + b = ti.Vector([1.0, 2.0]) + return a @ b.transpose() + + assert foo() == [[1.0, 2.0], [2.0, 4.0]] + + +@test_utils.test(arch=[ti.cuda, ti.cpu], + real_matrix=True, + real_matrix_scalarize=True) +def test_store_scalarize(): + @ti.kernel + def func(a: ti.types.ndarray()): + for i in range(5): + a[i] = [[i, i + 1], [i + 2, i + 3]] + + x = ti.Matrix.ndarray(2, 2, ti.i32, shape=5) + func(x) + + assert (x[0] == [[0, 1], [2, 3]]) + assert (x[1] == [[1, 2], [3, 4]]) + assert (x[2] == [[2, 3], [4, 5]]) + assert (x[3] == [[3, 4], [5, 6]]) + assert (x[4] == [[4, 5], [6, 7]]) + + +@test_utils.test(arch=[ti.cuda, ti.cpu], + real_matrix=True, + real_matrix_scalarize=True) +def test_load_store_scalarize(): + @ti.kernel + def func(a: ti.types.ndarray()): + for i in range(3): + a[i] = [[i, i + 1], [i + 2, i + 3]] + + a[3] = a[1] + a[4] = a[2] + + x = ti.Matrix.ndarray(2, 2, ti.i32, shape=5) + func(x) + + assert (x[3] == [[1, 2], [3, 4]]) + assert (x[4] == [[2, 3], [4, 5]]) diff --git a/tests/python/test_matrix_different_type.py b/tests/python/test_matrix_different_type.py index 397e72c40f3c1b..9a33ca4e7d369b 100644 --- a/tests/python/test_matrix_different_type.py +++ b/tests/python/test_matrix_different_type.py @@ -1,80 +1,14 @@ -from pytest import approx +import pytest import taichi as ti from tests import test_utils -# TODO: test more matrix operations -@test_utils.test() -def test_vector(): - type_list = [ti.f32, ti.i32] - - a = ti.Vector.field(len(type_list), dtype=type_list, shape=()) - b = ti.Vector.field(len(type_list), dtype=type_list, shape=()) - c = ti.Vector.field(len(type_list), dtype=type_list, shape=()) - - @ti.kernel - def init(): - a[None] = [1.0, 3] - b[None] = [2.0, 4] - c[None] = a[None] + b[None] - - def verify(): - assert isinstance(a[None][0], float) - assert isinstance(a[None][1], int) - assert isinstance(b[None][0], float) - assert isinstance(b[None][1], int) - assert c[None][0] == 3.0 - assert c[None][1] == 7 - - init() - verify() - - -# TODO: 
Support different element types of Matrix on opengl -@test_utils.test(require=ti.extension.data64, exclude=ti.opengl) -def test_matrix(): - type_list = [[ti.f32, ti.i32], [ti.i64, ti.f32]] - a = ti.Matrix.field(len(type_list), - len(type_list[0]), - dtype=type_list, - shape=()) - b = ti.Matrix.field(len(type_list), - len(type_list[0]), - dtype=type_list, - shape=()) - c = ti.Matrix.field(len(type_list), - len(type_list[0]), - dtype=type_list, - shape=()) - - @ti.kernel - def init(): - a[None] = [[1.0, 3], [1, 3.0]] - b[None] = [[2.0, 4], [-2, -3.0]] - c[None] = a[None] + b[None] - - def verify(): - assert isinstance(a[None][0, 0], float) - assert isinstance(a[None][0, 1], int) - assert isinstance(b[None][0, 0], float) - assert isinstance(b[None][0, 1], int) - assert c[None][0, 0] == 3.0 - assert c[None][0, 1] == 7 - assert c[None][1, 0] == -1 - assert c[None][1, 1] == 0.0 - - init() - verify() - - @test_utils.test(require=ti.extension.quant_basic) -def test_quant_type(): - qit1 = ti.types.quant.int(bits=10, signed=True) - qfxt1 = ti.types.quant.fixed(bits=10, signed=True, scale=0.1) - qit2 = ti.types.quant.int(bits=22, signed=False) - qfxt2 = ti.types.quant.fixed(bits=22, signed=False, scale=0.1) - type_list = [[qit1, qfxt2], [qfxt1, qit2]] +def test_valid(): + qflt = ti.types.quant.float(exp=8, frac=5, signed=True) + qfxt = ti.types.quant.fixed(bits=10, signed=True, scale=0.1) + type_list = [[qflt, qfxt], [qflt, qfxt]] a = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) b = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) c = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) @@ -99,15 +33,27 @@ def test_quant_type(): @ti.kernel def init(): - a[0] = [[1, 3.], [2., 1]] - b[0] = [[2, 4.], [-2., 1]] + a[0] = [[1.0, 3.0], [2.0, 1.0]] + b[0] = [[2.0, 4.0], [-2.0, 1.0]] c[0] = a[0] + b[0] def verify(): - assert c[0][0, 0] == approx(3, 1e-3) - assert c[0][0, 1] == approx(7.0, 1e-3) - assert c[0][1, 0] == approx(0, 
1e-3) - assert c[0][1, 1] == approx(2, 1e-3) + assert c[0][0, 0] == pytest.approx(3.0) + assert c[0][0, 1] == pytest.approx(7.0) + assert c[0][1, 0] == pytest.approx(0.0) + assert c[0][1, 1] == pytest.approx(2.0) init() verify() + + +@test_utils.test(require=ti.extension.quant_basic) +def test_invalid(): + qit = ti.types.quant.int(bits=10, signed=True) + qfxt = ti.types.quant.fixed(bits=10, signed=True, scale=0.1) + type_list = [qit, qfxt] + with pytest.raises( + RuntimeError, + match= + 'Member fields of a matrix field must have the same compute type'): + a = ti.Vector.field(len(type_list), dtype=type_list) diff --git a/tests/python/test_offline_cache.py b/tests/python/test_offline_cache.py index 1a0f13dfae5226..cb09af713a192c 100644 --- a/tests/python/test_offline_cache.py +++ b/tests/python/test_offline_cache.py @@ -18,7 +18,9 @@ OFFLINE_CACHE_TEMP_DIR = mkdtemp() atexit.register(lambda: rmdir(OFFLINE_CACHE_TEMP_DIR)) -supported_archs_offline_cache = [ti.cpu, ti.cuda, ti.vulkan] +supported_llvm_archs = {ti.cpu, ti.cuda} +supported_gfx_archs = {ti.opengl, ti.vulkan} +supported_archs_offline_cache = supported_llvm_archs | supported_gfx_archs supported_archs_offline_cache = [ v for v in supported_archs_offline_cache if v in test_utils.expected_archs() @@ -40,12 +42,19 @@ def cache_files_size(path): def expected_num_cache_files(arch, num_offloads: List[int] = None) -> int: + assert arch in supported_archs_offline_cache if not num_offloads: return 0 - result = sum(num_offloads) if arch in [ti.vulkan] else len(num_offloads) - if arch in [ti.cpu, ti.cuda]: + result = 0 + # code files + if arch in supported_llvm_archs: + result += len(num_offloads) + elif arch in supported_gfx_archs: + result += sum(num_offloads) + # metadata files + if arch in supported_llvm_archs: result += 2 # metadata.{json, tcb} - elif arch in [ti.vulkan]: + elif arch in supported_gfx_archs: # metadata.{json, tcb}, graphs.tcb, offline_cache_metadata.tcb result += 4 return result @@ -56,9 +65,9 @@ 
def tmp_offline_cache_file_path(): def backend_specified_cache_path(arch): - if arch in [ti.cpu, ti.cuda]: + if arch in supported_llvm_archs: return join(tmp_offline_cache_file_path(), 'llvm') - elif arch in [ti.vulkan]: + elif arch in supported_gfx_archs: return join(tmp_offline_cache_file_path(), 'gfx') assert False diff --git a/tests/python/test_print.py b/tests/python/test_print.py index df3328149345ce..49244dd5f14ff1 100644 --- a/tests/python/test_print.py +++ b/tests/python/test_print.py @@ -3,6 +3,9 @@ import taichi as ti from tests import test_utils +#TODO: validation layer support on macos vulkan backend is not working. +vk_on_mac = (ti.vulkan, 'Darwin') + # Not really testable.. # Just making sure it does not crash @@ -23,8 +26,7 @@ def func(): # TODO: As described by @k-ye above, what we want to ensure # is that, the content shows on console is *correct*. -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_multi_print(): @ti.kernel def func(x: ti.i32, y: ti.f32): @@ -34,8 +36,8 @@ def func(x: ti.i32, y: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +# TODO: vulkan doesn't support %s but we should ignore it instead of crashing. 
+@test_utils.test(exclude=[ti.vulkan, ti.dx11]) def test_print_string(): @ti.kernel def func(x: ti.i32, y: ti.f32): @@ -47,8 +49,7 @@ def func(x: ti.i32, y: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_print_matrix(): x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=()) y = ti.Vector.field(3, dtype=ti.f32, shape=3) @@ -64,8 +65,7 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_print_sep_end(): @ti.kernel def func(): @@ -85,8 +85,7 @@ def func(): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_print_multiple_threads(): x = ti.field(dtype=ti.f32, shape=(128, )) @@ -102,8 +101,7 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan, - ti.dx11]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True) def test_print_list(): x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=(2, 3)) y = ti.Vector.field(3, dtype=ti.f32, shape=()) @@ -124,7 +122,7 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(arch=ti.cpu) +@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True) def test_python_scope_print_field(): x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=()) y = ti.Vector.field(3, dtype=ti.f32, shape=3) @@ -135,7 +133,7 @@ def test_python_scope_print_field(): print(z) -@test_utils.test(arch=ti.cpu) +@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True) def test_print_string_format(): @ti.kernel def func(k: ti.f32): @@ -151,7 +149,7 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(arch=ti.cpu) +@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True) def test_print_fstring(): def foo1(x): return x 
+ 1 diff --git a/tests/python/test_scan.py b/tests/python/test_scan.py index 0a19bc855460df..72dd5f415cb30d 100644 --- a/tests/python/test_scan.py +++ b/tests/python/test_scan.py @@ -2,7 +2,7 @@ from tests import test_utils -@test_utils.test(arch=[ti.cuda, ti.vulkan]) +@test_utils.test(arch=[ti.cuda, ti.vulkan], exclude=[(ti.vulkan, "Darwin")]) def test_scan(): def test_scan_for_dtype(dtype, N): arr = ti.field(dtype, N) diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index accf5b1dbc9a94..3679d72f861837 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -434,7 +434,7 @@ def reduce_all() -> dtype: # i.e. any device other than a subgroup size of 1 should have one non active group -@test_utils.test(arch=ti.vulkan) +@test_utils.test(arch=ti.vulkan, exclude=[(ti.vulkan, "Darwin")]) def test_subgroup_reduction_add_i32(): _test_subgroup_reduce(ti.atomic_add, subgroup.reduce_add, np.sum, 2677, 0, ti.i32) @@ -451,7 +451,7 @@ def test_subgroup_reduction_add_f32(): # _test_subgroup_reduce(ti.atomic_add, subgroup.reduce_mul, np.prod, 8, 1, ti.f32) -@test_utils.test(arch=ti.vulkan) +@test_utils.test(arch=ti.vulkan, exclude=[(ti.vulkan, "Darwin")]) def test_subgroup_reduction_max_i32(): _test_subgroup_reduce(ti.atomic_max, subgroup.reduce_max, np.max, 2677, 0, ti.i32) diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py index 1a1973ee6fa35f..10cf8451a25f03 100644 --- a/tests/python/test_sparse_matrix.py +++ b/tests/python/test_sparse_matrix.py @@ -379,9 +379,9 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), @test_utils.test(arch=ti.cuda) def test_gpu_sparse_matrix(): - h_coo_row = np.asarray([0, 0, 0, 1, 2, 2, 2, 3, 3], dtype=np.int32) - h_coo_col = np.asarray([0, 2, 3, 1, 0, 2, 3, 1, 3], dtype=np.int32) - h_coo_val = np.asarray([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], + h_coo_row = np.asarray([1, 0, 0, 0, 2, 2, 2, 3, 3], dtype=np.int32) + h_coo_col = np.asarray([1, 0, 2, 3, 0, 2, 3, 1, 3], 
dtype=np.int32) + h_coo_val = np.asarray([4.0, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0], dtype=np.float32) h_X = np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32) h_Y = np.asarray([19.0, 8.0, 51.0, 52.0], dtype=np.float32) diff --git a/tests/test_config.json b/tests/test_config.json new file mode 100644 index 00000000000000..10cd191721e048 --- /dev/null +++ b/tests/test_config.json @@ -0,0 +1,195 @@ +{ +"aot_test_cases" : { + "LlvmAotTest.CpuKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=cpu" + ], + "LlvmAotTest.CudaKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=cuda" + ], + "LlvmAotTest.CpuField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cpu" + ], + "LlvmAotTest.CudaField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cuda" + ], + "LlvmAotTest.CpuDynamic": [ + ["cpp", "aot", "python_scripts", "dynamic_aot_test.py"], + "--arch=cpu" + ], + "LlvmAotTest.CudaDynamic": [ + ["cpp", "aot", "python_scripts", "dynamic_aot_test.py"], + "--arch=cuda" + ], + "LlvmAotTest.CpuBitmasked": [ + ["cpp", "aot", "python_scripts", "bitmasked_aot_test.py"], + "--arch=cpu" + ], + "LlvmAotTest.CudaBitmasked": [ + ["cpp", "aot", "python_scripts", "bitmasked_aot_test.py"], + "--arch=cuda" + ], + "LlvmCGraph.RunGraphCpu": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=cpu" + ], + "LlvmCGraph.RunGraphCuda": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=cuda" + ], + "LlvmCGraph.CpuField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cpu --cgraph" + ], + "LlvmCGraph.CudaField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cuda --cgraph" + ], + "LlvmCGraph.Mpm88Cpu": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=cpu --cgraph" + ], + "LlvmCGraph.Mpm88Cuda": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=cuda --cgraph" + ], + 
"CGraphAotTest.VulkanMpm88": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=vulkan --cgraph" + ], + "CGraphAotTest.OpenglMpm88": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=opengl --cgraph" + ], + "GfxAotTest.VulkanDenseField": [ + ["cpp", "aot", "python_scripts", + "dense_field_aot_test.py"], "--arch=vulkan" + ], + "GfxAotTest.OpenglDenseField": [ + ["cpp", "aot", "python_scripts", + "dense_field_aot_test.py"], "--arch=opengl" + ], + "GfxAotTest.VulkanKernelTest1": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=vulkan" + ], + "GfxAotTest.OpenglKernelTest1": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=opengl" + ], + "GfxAotTest.VulkanKernelTest2": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test2.py"], + "--arch=vulkan" + ], + "GfxAotTest.OpenglKernelTest2": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test2.py"], + "--arch=opengl" + ], + "CGraphAotTest.VulkanRunCGraph1": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=vulkan" + ], + "CGraphAotTest.VulkanRunCGraph2": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test2.py"], + "--arch=vulkan --cgraph" + ], + "CGraphAotTest.OpenglRunCGraph1": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=opengl" + ], + "CGraphAotTest.OpenglRunCGraph2": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test2.py"], + "--arch=opengl --cgraph" + ] +}, + +"capi_aot_test_cases" : { + "CapiMpm88Test.Vulkan": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=vulkan" + ], + "CapiMpm88Test.Opengl": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=opengl" + ], + "CapiMpm88Test.Cuda": [ + ["cpp", "aot", "python_scripts", "mpm88_graph_aot.py"], + "--arch=cuda" + ], + "CapiSphTest.Vulkan": [ + ["cpp", "aot", "python_scripts", "sph_aot.py"], + "--arch=vulkan" + ], + "CapiSphTest.Opengl": [ + ["cpp", "aot", "python_scripts", "sph_aot.py"], + "--arch=opengl" 
+ ], + "CapiSphTest.Cuda": [ + ["cpp", "aot", "python_scripts", "sph_aot.py"], + "--arch=cuda" + ], + "CapiCometTest.Cuda": [ + ["cpp", "aot", "python_scripts", "comet_aot.py"], + "--arch=cuda" + ], + "CapiTaichiSparseTest.Cuda": [ + ["cpp", "aot", "python_scripts", "taichi_sparse_test.py"], + "" + ], + "CapiAotTest.CpuField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cpu" + ], + "CapiAotTest.CudaField": [ + ["cpp", "aot", "python_scripts", "field_aot_test.py"], + "--arch=cuda" + ], + "CapiGraphTest.CpuGraph": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=cpu" + ], + "CapiGraphTest.CudaGraph": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=cuda" + ], + "CapiGraphTest.VulkanGraph": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=vulkan" + ], + "CapiGraphTest.VulkanTextureGraph": [ + ["cpp", "aot", "python_scripts", "texture_aot_test.py"], + "--arch=vulkan" + ], + "CapiGraphTest.OpenglGraph": [ + ["cpp", "aot", "python_scripts", "graph_aot_test.py"], + "--arch=opengl" + ], + "CapiAotTest.CpuKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=cpu" + ], + "CapiAotTest.CudaKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=cuda" + ], + "CapiAotTest.VulkanKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=vulkan" + ], + "CapiAotTest.OpenglKernel": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=opengl" + ], + "CapiDryRun.VulkanAotModule": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=vulkan" + ], + "CapiDryRun.OpenglAotModule": [ + ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"], + "--arch=opengl" + ] +} +} diff --git a/tests/test_utils.py b/tests/test_utils.py index d0eecec78bc613..17a9c0b880b31b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,8 +1,10 @@ import copy import functools import itertools +import json import os 
import pathlib +import platform from errno import EEXIST from tempfile import NamedTemporaryFile, mkstemp @@ -14,199 +16,34 @@ import taichi as ti -__aot_test_cases = { - "LlvmAotTest.CpuKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=cpu" - ], - "LlvmAotTest.CudaKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=cuda" - ], - "LlvmAotTest.CpuField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cpu" - ], - "LlvmAotTest.CudaField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cuda" - ], - "LlvmAotTest.CpuDynamic": [ - os.path.join('cpp', 'aot', 'python_scripts', 'dynamic_aot_test.py'), - "--arch=cpu" - ], - "LlvmAotTest.CudaDynamic": [ - os.path.join('cpp', 'aot', 'python_scripts', 'dynamic_aot_test.py'), - "--arch=cuda" - ], - "LlvmAotTest.CpuBitmasked": [ - os.path.join('cpp', 'aot', 'python_scripts', 'bitmasked_aot_test.py'), - "--arch=cpu" - ], - "LlvmAotTest.CudaBitmasked": [ - os.path.join('cpp', 'aot', 'python_scripts', 'bitmasked_aot_test.py'), - "--arch=cuda" - ], - "LlvmCGraph.RunGraphCpu": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=cpu" - ], - "LlvmCGraph.RunGraphCuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=cuda" - ], - "LlvmCGraph.CpuField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cpu --cgraph" - ], - "LlvmCGraph.CudaField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cuda --cgraph" - ], - "LlvmCGraph.Mpm88Cpu": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=cpu --cgraph" - ], - "LlvmCGraph.Mpm88Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=cuda --cgraph" - ], - "CGraphAotTest.VulkanMpm88": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - 
"--arch=vulkan --cgraph" - ], - "CGraphAotTest.OpenglMpm88": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=opengl --cgraph" - ], - "GfxAotTest.VulkanDenseField": [ - os.path.join('cpp', 'aot', 'python_scripts', - 'dense_field_aot_test.py'), "--arch=vulkan" - ], - "GfxAotTest.OpenglDenseField": [ - os.path.join('cpp', 'aot', 'python_scripts', - 'dense_field_aot_test.py'), "--arch=opengl" - ], - "GfxAotTest.VulkanKernelTest1": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=vulkan" - ], - "GfxAotTest.OpenglKernelTest1": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=opengl" - ], - "GfxAotTest.VulkanKernelTest2": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test2.py'), - "--arch=vulkan" - ], - "GfxAotTest.OpenglKernelTest2": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test2.py'), - "--arch=opengl" - ], - "CGraphAotTest.VulkanRunCGraph1": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=vulkan" - ], - "CGraphAotTest.VulkanRunCGraph2": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test2.py'), - "--arch=vulkan --cgraph" - ], - "CGraphAotTest.OpenglRunCGraph1": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=opengl" - ], - "CGraphAotTest.OpenglRunCGraph2": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test2.py'), - "--arch=opengl --cgraph" - ], -} - -__capi_aot_test_cases = { - "CapiMpm88Test.Vulkan": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=vulkan" - ], - "CapiMpm88Test.Opengl": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=opengl" - ], - "CapiMpm88Test.Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'mpm88_graph_aot.py'), - "--arch=cuda" - ], - "CapiSphTest.Vulkan": [ - os.path.join('cpp', 'aot', 'python_scripts', 'sph_aot.py'), - "--arch=vulkan" - ], - 
"CapiSphTest.Opengl": [ - os.path.join('cpp', 'aot', 'python_scripts', 'sph_aot.py'), - "--arch=opengl" - ], - "CapiSphTest.Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'sph_aot.py'), - "--arch=cuda" - ], - "CapiCometTest.Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'comet_aot.py'), - "--arch=cuda" - ], - "CapiTaichiSparseTest.Cuda": [ - os.path.join('cpp', 'aot', 'python_scripts', 'taichi_sparse_test.py'), - "" - ], - "CapiAotTest.CpuField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cpu" - ], - "CapiAotTest.CudaField": [ - os.path.join('cpp', 'aot', 'python_scripts', 'field_aot_test.py'), - "--arch=cuda" - ], - "CapiGraphTest.CpuGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=cpu" - ], - "CapiGraphTest.CudaGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=cuda" - ], - "CapiGraphTest.VulkanGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=vulkan" - ], - "CapiGraphTest.VulkanTextureGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'texture_aot_test.py'), - "--arch=vulkan" - ], - "CapiGraphTest.OpenglGraph": [ - os.path.join('cpp', 'aot', 'python_scripts', 'graph_aot_test.py'), - "--arch=opengl" - ], - "CapiAotTest.CpuKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=cpu" - ], - "CapiAotTest.CudaKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=cuda" - ], - "CapiAotTest.VulkanKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=vulkan" - ], - "CapiAotTest.OpenglKernel": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=opengl" - ], - "CapiDryRun.VulkanAotModule": [ - os.path.join('cpp', 'aot', 'python_scripts', 'kernel_aot_test1.py'), - "--arch=vulkan" - ], - "CapiDryRun.OpenglAotModule": [ - os.path.join('cpp', 'aot', 
'python_scripts', 'kernel_aot_test1.py'), - "--arch=opengl" - ], -} + +def parse_test_configs(): + curr_dir = os.path.dirname(os.path.abspath(__file__)) + test_config_path = os.path.join(curr_dir, "test_config.json") + with open(test_config_path, "r") as f: + test_config = json.loads(f.read()) + + assert ("aot_test_cases" in test_config.keys()) + assert ("capi_aot_test_cases" in test_config.keys()) + + for cpp_test_name, value in test_config["aot_test_cases"].items(): + test_paths = value[0] + test_args = value[1] + test_config["aot_test_cases"][cpp_test_name] = [ + os.path.join(*test_paths), test_args + ] + + for cpp_test_name, value in test_config["capi_aot_test_cases"].items(): + test_paths = value[0] + test_args = value[1] + test_config["capi_aot_test_cases"][cpp_test_name] = [ + os.path.join(*test_paths), test_args + ] + + return test_config["aot_test_cases"], test_config["capi_aot_test_cases"] + + +__aot_test_cases, __capi_aot_test_cases = parse_test_configs() def print_aot_test_guide(): @@ -217,14 +54,14 @@ def print_aot_test_guide(): 1. A python script that compiles the Kernels and serialize into file. 2. A C++ test that loads the file then perform execution. -AOT test writer will have to configure your test case for "__aot_test_cases", +AOT test writer will have to configure your test case for "__aot_test_cases" in "test_config.json", the format of which follows: - "cpp_test_name" : ["python_program_path", "--arguments"] + "cpp_test_name" : [["python_program_path"], "--arguments"] For example: - "LlvmProgramTest.FullPipeline": ["cpp/aot/llvm/kernel_aot_test1.py", "--arch=cpu"] + "LlvmProgramTest.FullPipeline": [["cpp", "aot", "llvm", "kernel_aot_test1.py"], "--arch=cpu"] The temporary directory where serialized cache file stays will be generated by run_tests.py. Both python program and C++ tests receives this directory path via environment variable "TAICHI_AOT_FOLDER_PATH". 
@@ -402,22 +239,51 @@ def test(arch=None, exclude=None, require=None, **options): .. function:: ti.test(arch=[], exclude=[], require=[], **options) :parameter arch: backends to include - :parameter exclude: backends to exclude + :parameter exclude: backends and platforms to exclude :parameter require: extensions required :parameter options: other options to be passed into ``ti.init`` """ + def exclude_arch_platform(arch, system, exclude): + # Preprocess exclude + if exclude is None: + exclude = [] + if not isinstance(exclude, (list, tuple)): + exclude = [exclude] + + for pair in exclude: + exclude_arch = None + exclude_sys = None + if isinstance(pair, (list, tuple)): + if len(pair) == 1: + # exclude = [(vulkan), ...] + exclude_arch = pair[0] + else: + # exclude = [(vulkan, Darwin), ...] + assert len(pair) == 2 + exclude_arch = pair[0] + exclude_sys = pair[1] + else: + # exclude = [vulkan, cpu, ...] + exclude_arch = pair + + assert (exclude_arch is not None) or (exclude_sys is not None) + if exclude_arch and exclude_sys: + if exclude_arch == arch and exclude_sys == system: + return True + elif exclude_arch and exclude_arch == arch: + return True + elif exclude_sys and exclude_sys == system: + return True + + return False if arch is None: arch = [] - if exclude is None: - exclude = [] if require is None: require = [] if not isinstance(arch, (list, tuple)): arch = [arch] - if not isinstance(exclude, (list, tuple)): - exclude = [exclude] if not isinstance(require, (list, tuple)): require = [require] archs_expected = expected_archs() @@ -434,7 +300,11 @@ def test(arch=None, exclude=None, require=None, **options): # List of (arch, options) to parametrize the test function parameters = [] for req_arch, *req_params in itertools.product(*arch_params_sets): - if (req_arch not in arch) or (req_arch in exclude): + if req_arch not in arch: + continue + + curr_system = platform.system() + if exclude_arch_platform(req_arch, curr_system, exclude): continue if not all( diff --git 
a/version.txt b/version.txt index 99a4aef0c4d4d5..c641220244f1c6 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v1.1.3 +v1.1.4